mirror of
https://github.com/comfyanonymous/ComfyUI.git
synced 2026-01-17 01:30:50 +08:00
509 lines
21 KiB
Python
509 lines
21 KiB
Python
# Copyright (c) OpenMMLab. All rights reserved.
|
||
import copy
|
||
from collections import defaultdict
|
||
from itertools import chain
|
||
|
||
from torch.nn.utils import clip_grad
|
||
|
||
from annotator.uniformer.mmcv.utils import TORCH_VERSION, _BatchNorm, digit_version
|
||
from ..dist_utils import allreduce_grads
|
||
from ..fp16_utils import LossScaler, wrap_fp16_model
|
||
from .hook import HOOKS, Hook
|
||
|
||
try:
|
||
# If PyTorch version >= 1.6.0, torch.cuda.amp.GradScaler would be imported
|
||
# and used; otherwise, auto fp16 will adopt mmcv's implementation.
|
||
from torch.cuda.amp import GradScaler
|
||
except ImportError:
|
||
pass
|
||
|
||
|
||
@HOOKS.register_module()
class OptimizerHook(Hook):
    """Hook that back-propagates the loss and steps the optimizer.

    Args:
        grad_clip (dict, optional): Keyword arguments passed through to
            ``clip_grad.clip_grad_norm_``. When ``None`` (default), the
            gradients are not clipped.
    """

    def __init__(self, grad_clip=None):
        self.grad_clip = grad_clip

    def clip_grads(self, params):
        """Clip the gradient norm of the parameters that own a gradient.

        Args:
            params (Iterable): Parameters to consider. Entries that do not
                require grad, or whose ``grad`` is ``None``, are ignored.

        Returns:
            The value returned by ``clip_grad.clip_grad_norm_``, or ``None``
            when no parameter qualifies.
        """
        with_grad = [
            p for p in params if p.requires_grad and p.grad is not None
        ]
        if not with_grad:
            return None
        return clip_grad.clip_grad_norm_(with_grad, **self.grad_clip)

    def after_train_iter(self, runner):
        """Zero the grads, back-propagate the loss and step the optimizer.

        When ``grad_clip`` is set, gradients are clipped before the step and
        the resulting norm (if any) is pushed to ``runner.log_buffer``.
        """
        optimizer = runner.optimizer
        optimizer.zero_grad()
        runner.outputs['loss'].backward()

        if self.grad_clip is not None:
            total_norm = self.clip_grads(runner.model.parameters())
            if total_norm is not None:
                # Record the gradient norm so the logger can report it.
                runner.log_buffer.update(
                    {'grad_norm': float(total_norm)},
                    runner.outputs['num_samples'])

        optimizer.step()
|
||
|
||
|
||
@HOOKS.register_module()
class GradientCumulativeOptimizerHook(OptimizerHook):
    """Optimizer Hook implements multi-iters gradient cumulating.

    Args:
        cumulative_iters (int, optional): Num of gradient cumulative iters.
            The optimizer will step every `cumulative_iters` iters.
            Defaults to 1.
        **kwargs: Forwarded unchanged to the parent hook's ``__init__``.

    Examples:
        >>> # Use cumulative_iters to simulate a large batch size
        >>> # It is helpful when the hardware cannot handle a large batch size.
        >>> loader = DataLoader(data, batch_size=64)
        >>> optim_hook = GradientCumulativeOptimizerHook(cumulative_iters=4)
        >>> # almost equals to
        >>> loader = DataLoader(data, batch_size=256)
        >>> optim_hook = OptimizerHook()
    """

    def __init__(self, cumulative_iters=1, **kwargs):
        super(GradientCumulativeOptimizerHook, self).__init__(**kwargs)

        # Kept as an assert so the exception type seen by callers is
        # unchanged.
        assert isinstance(cumulative_iters, int) and cumulative_iters > 0, \
            f'cumulative_iters only accepts positive int, but got ' \
            f'{type(cumulative_iters)} instead.'

        self.cumulative_iters = cumulative_iters
        # Both values are filled in lazily by ``_init`` on the first
        # training iteration, once the runner is available.
        self.divisible_iters = 0
        self.remainder_iters = 0
        self.initialized = False

    def has_batch_norm(self, module):
        """Return True if ``module`` or any descendant is a ``_BatchNorm``."""
        if isinstance(module, _BatchNorm):
            return True
        return any(self.has_batch_norm(child) for child in module.children())

    def _init(self, runner):
        """Compute the iteration bookkeeping and emit one-time warnings.

        ``divisible_iters`` is the largest multiple of ``cumulative_iters``
        not exceeding ``runner.max_iters``; ``remainder_iters`` is the length
        of the final, shorter accumulation window.
        """
        if runner.iter % self.cumulative_iters != 0:
            runner.logger.warning(
                'Resume iter number is not divisible by cumulative_iters in '
                'GradientCumulativeOptimizerHook, which means the gradient of '
                'some iters is lost and the result may be influenced slightly.'
            )

        if self.has_batch_norm(runner.model) and self.cumulative_iters > 1:
            runner.logger.warning(
                'GradientCumulativeOptimizerHook may slightly decrease '
                'performance if the model has BatchNorm layers.')

        # These thresholds are compared against the absolute ``runner.iter``
        # in ``after_train_iter``, so they must be absolute as well. Deriving
        # them from ``max_iters - runner.iter`` (the remaining iterations)
        # made a resumed run switch to ``remainder_iters`` too early, and
        # divide the loss by zero whenever that remainder was 0.
        self.divisible_iters = (
            runner.max_iters // self.cumulative_iters * self.cumulative_iters)
        self.remainder_iters = runner.max_iters - self.divisible_iters

        self.initialized = True

    def after_train_iter(self, runner):
        """Accumulate the scaled-down loss gradient and step periodically.

        The loss is divided by the size of the current accumulation window
        before ``backward()``. The optimizer steps (and the gradients are
        cleared) when ``every_n_iters`` or ``is_last_iter`` reports true.
        """
        if not self.initialized:
            self._init(runner)

        if runner.iter < self.divisible_iters:
            loss_factor = self.cumulative_iters
        else:
            # Only reachable in the trailing partial window, where
            # ``remainder_iters`` is strictly positive.
            loss_factor = self.remainder_iters
        loss = runner.outputs['loss']
        loss = loss / loss_factor
        loss.backward()

        if (self.every_n_iters(runner, self.cumulative_iters)
                or self.is_last_iter(runner)):

            if self.grad_clip is not None:
                grad_norm = self.clip_grads(runner.model.parameters())
                if grad_norm is not None:
                    # Add grad norm to the logger
                    runner.log_buffer.update({'grad_norm': float(grad_norm)},
                                             runner.outputs['num_samples'])
            runner.optimizer.step()
            runner.optimizer.zero_grad()
|
||
|
||
|
||
if (TORCH_VERSION != 'parrots'
|
||
and digit_version(TORCH_VERSION) >= digit_version('1.6.0')):
|
||
|
||
@HOOKS.register_module()
class Fp16OptimizerHook(OptimizerHook):
    """FP16 optimizer hook (using PyTorch's implementation).

    If you are using PyTorch >= 1.6, torch.cuda.amp is used as the backend,
    to take care of the optimization procedure.

    Args:
        loss_scale (float | str | dict): Scale factor configuration.
            If loss_scale is a float, static loss scaling will be used with
            the specified scale. If loss_scale is a string, it must be
            'dynamic', then dynamic loss scaling will be used.
            It can also be a dict containing arguments of GradScaler.
            Defaults to 512. For PyTorch >= 1.6, mmcv uses the official
            implementation of GradScaler. If you use a dict version of
            loss_scale to create GradScaler, please refer to:
            https://pytorch.org/docs/stable/amp.html#torch.cuda.amp.GradScaler
            for the parameters.

    Examples:
        >>> loss_scale = dict(
        ...     init_scale=65536.0,
        ...     growth_factor=2.0,
        ...     backoff_factor=0.5,
        ...     growth_interval=2000
        ... )
        >>> optimizer_hook = Fp16OptimizerHook(loss_scale=loss_scale)
    """

    def __init__(self,
                 grad_clip=None,
                 coalesce=True,
                 bucket_size_mb=-1,
                 loss_scale=512.,
                 distributed=True):
        self.grad_clip = grad_clip
        self.coalesce = coalesce
        self.bucket_size_mb = bucket_size_mb
        self.distributed = distributed

        # Value handed to ``GradScaler.update`` after every step; it stays
        # ``None`` unless a static (float) scale was requested.
        static_scale = None
        if loss_scale == 'dynamic':
            scaler_kwargs = {}
        elif isinstance(loss_scale, float):
            static_scale = loss_scale
            scaler_kwargs = {'init_scale': loss_scale}
        elif isinstance(loss_scale, dict):
            scaler_kwargs = loss_scale
        else:
            raise ValueError('loss_scale must be of type float, dict, or '
                             f'"dynamic", got {loss_scale}')
        self._scale_update_param = static_scale
        self.loss_scaler = GradScaler(**scaler_kwargs)

    def before_run(self, runner):
        """Preparing steps before Mixed Precision Training."""
        # Put the model into fp16 mode.
        wrap_fp16_model(runner.model)

        # Restore the scaler state when resuming from a checkpoint.
        meta = runner.meta
        if 'fp16' not in meta or 'loss_scaler' not in meta['fp16']:
            return
        self.loss_scaler.load_state_dict(meta['fp16']['loss_scaler'])

    def copy_grads_to_fp32(self, fp16_net, fp32_weights):
        """Copy gradients from fp16 model to fp32 weight copy."""
        pairs = zip(fp32_weights, fp16_net.parameters())
        for master_param, model_param in pairs:
            if model_param.grad is None:
                continue
            if master_param.grad is None:
                # Allocate a gradient buffer shaped like the master weight.
                master_param.grad = master_param.data.new(
                    master_param.size())
            master_param.grad.copy_(model_param.grad)

    def copy_params_to_fp16(self, fp16_net, fp32_weights):
        """Copy updated params from fp32 weight copy to fp16 model."""
        pairs = zip(fp16_net.parameters(), fp32_weights)
        for model_param, master_param in pairs:
            model_param.data.copy_(master_param.data)

    def after_train_iter(self, runner):
        """Backward optimization steps for Mixed Precision Training. For
        dynamic loss scaling, please refer to
        https://pytorch.org/docs/stable/amp.html#torch.cuda.amp.GradScaler.

        1. Scale the loss by a scale factor.
        2. Backward the loss to obtain the gradients.
        3. Unscale the optimizer's gradient tensors.
        4. Call optimizer.step() and update scale factor.
        5. Save loss_scaler state_dict for resume purpose.
        """
        scaler = self.loss_scaler
        optimizer = runner.optimizer

        # Drop whatever gradients the previous iteration left behind.
        runner.model.zero_grad()
        optimizer.zero_grad()

        scaler.scale(runner.outputs['loss']).backward()
        scaler.unscale_(optimizer)

        # Optional gradient clipping, performed on the unscaled gradients.
        if self.grad_clip is not None:
            total_norm = self.clip_grads(runner.model.parameters())
            if total_norm is not None:
                # Record the gradient norm so the logger can report it.
                runner.log_buffer.update(
                    {'grad_norm': float(total_norm)},
                    runner.outputs['num_samples'])

        # Step the optimizer through the scaler, then refresh the scale.
        scaler.step(optimizer)
        scaler.update(self._scale_update_param)

        # Persist the scaler state so a resumed run can restore it.
        scaler_state = scaler.state_dict()
        runner.meta.setdefault('fp16', {})['loss_scaler'] = scaler_state
|
||
|
||
@HOOKS.register_module()
class GradientCumulativeFp16OptimizerHook(GradientCumulativeOptimizerHook,
                                          Fp16OptimizerHook):
    """Fp16 optimizer Hook (using PyTorch's implementation) implements
    multi-iters gradient cumulating.

    If you are using PyTorch >= 1.6, torch.cuda.amp is used as the backend,
    to take care of the optimization procedure.
    """

    def __init__(self, *args, **kwargs):
        super(GradientCumulativeFp16OptimizerHook,
              self).__init__(*args, **kwargs)

    def after_train_iter(self, runner):
        """Accumulate scaled gradients and step once per cumulative window.

        The loss is divided by the current window size, scaled by the loss
        scaler and back-propagated on every iteration. The unscale / clip /
        step / update sequence only runs when ``every_n_iters`` or
        ``is_last_iter`` reports true, after which the gradients are cleared.
        """
        if not self.initialized:
            self._init(runner)

        in_full_window = runner.iter < self.divisible_iters
        loss_factor = (
            self.cumulative_iters if in_full_window else self.remainder_iters)
        averaged_loss = runner.outputs['loss'] / loss_factor

        scaler = self.loss_scaler
        scaler.scale(averaged_loss).backward()

        should_step = (
            self.every_n_iters(runner, self.cumulative_iters)
            or self.is_last_iter(runner))
        if not should_step:
            return

        optimizer = runner.optimizer

        # Undo the loss scaling on the gradients held by the optimizer.
        scaler.unscale_(optimizer)

        if self.grad_clip is not None:
            total_norm = self.clip_grads(runner.model.parameters())
            if total_norm is not None:
                # Record the gradient norm so the logger can report it.
                runner.log_buffer.update(
                    {'grad_norm': float(total_norm)},
                    runner.outputs['num_samples'])

        # Step the optimizer through the scaler, then refresh the scale.
        scaler.step(optimizer)
        scaler.update(self._scale_update_param)

        # Persist the scaler state so a resumed run can restore it.
        scaler_state = scaler.state_dict()
        runner.meta.setdefault('fp16', {})['loss_scaler'] = scaler_state

        # Start the next accumulation window from clean gradients.
        runner.model.zero_grad()
        optimizer.zero_grad()
|
||
|
||
else:
|
||
|
||
@HOOKS.register_module()
class Fp16OptimizerHook(OptimizerHook):
    """FP16 optimizer hook (mmcv's implementation).

    The steps of fp16 optimizer is as follows.
    1. Scale the loss value.
    2. BP in the fp16 model.
    3. Copy gradients from fp16 model to fp32 weights.
    4. Update fp32 weights.
    5. Copy updated parameters from fp32 weights to fp16 model.

    Refer to https://arxiv.org/abs/1710.03740 for more details.

    Args:
        loss_scale (float | str | dict): Scale factor configuration.
            If loss_scale is a float, static loss scaling will be used with
            the specified scale. If loss_scale is a string, it must be
            'dynamic', then dynamic loss scaling will be used.
            It can also be a dict containing arguments of LossScaler.
            Defaults to 512.
    """

    def __init__(self,
                 grad_clip=None,
                 coalesce=True,
                 bucket_size_mb=-1,
                 loss_scale=512.,
                 distributed=True):
        self.grad_clip = grad_clip
        self.coalesce = coalesce
        self.bucket_size_mb = bucket_size_mb
        self.distributed = distributed

        # Translate the user-facing ``loss_scale`` into LossScaler kwargs.
        if loss_scale == 'dynamic':
            scaler_kwargs = {'mode': 'dynamic'}
        elif isinstance(loss_scale, float):
            scaler_kwargs = {'init_scale': loss_scale, 'mode': 'static'}
        elif isinstance(loss_scale, dict):
            scaler_kwargs = loss_scale
        else:
            raise ValueError('loss_scale must be of type float, dict, or '
                             f'"dynamic", got {loss_scale}')
        self.loss_scaler = LossScaler(**scaler_kwargs)

    def before_run(self, runner):
        """Preparing steps before Mixed Precision Training.

        1. Make a master copy of fp32 weights for optimization.
        2. Convert the main model from fp32 to fp16.
        """
        optimizer = runner.optimizer

        # Give the optimizer its own deep copy of the parameter groups. The
        # original groups are kept only to pair each old parameter with its
        # copy.
        original_groups = optimizer.param_groups
        optimizer.param_groups = copy.deepcopy(optimizer.param_groups)

        original_params = chain.from_iterable(
            group['params'] for group in original_groups)
        copied_params = chain.from_iterable(
            group['params'] for group in optimizer.param_groups)
        old_to_new = dict(zip(original_params, copied_params))

        # Re-key the existing optimizer state onto the copied parameters.
        remapped_state = defaultdict(dict)
        for old_param, param_state in optimizer.state.items():
            remapped_state[old_to_new[old_param]] = param_state
        optimizer.state = remapped_state

        # Convert the model to fp16.
        wrap_fp16_model(runner.model)

        # Restore the scaler state when resuming from a checkpoint.
        meta = runner.meta
        if 'fp16' in meta and 'loss_scaler' in meta['fp16']:
            self.loss_scaler.load_state_dict(meta['fp16']['loss_scaler'])

    def copy_grads_to_fp32(self, fp16_net, fp32_weights):
        """Copy gradients from fp16 model to fp32 weight copy."""
        pairs = zip(fp32_weights, fp16_net.parameters())
        for master_param, model_param in pairs:
            if model_param.grad is None:
                continue
            if master_param.grad is None:
                # Allocate a gradient buffer shaped like the master weight.
                master_param.grad = master_param.data.new(
                    master_param.size())
            master_param.grad.copy_(model_param.grad)

    def copy_params_to_fp16(self, fp16_net, fp32_weights):
        """Copy updated params from fp32 weight copy to fp16 model."""
        pairs = zip(fp16_net.parameters(), fp32_weights)
        for model_param, master_param in pairs:
            model_param.data.copy_(master_param.data)

    def after_train_iter(self, runner):
        """Backward optimization steps for Mixed Precision Training. For
        dynamic loss scaling, please refer to `loss_scalar.py`

        1. Scale the loss by a scale factor.
        2. Backward the loss to obtain the gradients (fp16).
        3. Copy gradients from the model to the fp32 weight copy.
        4. Scale the gradients back and update the fp32 weight copy.
        5. Copy back the params from fp32 weight copy to the fp16 model.
        6. Save loss_scaler state_dict for resume purpose.
        """
        scaler = self.loss_scaler
        optimizer = runner.optimizer
        model = runner.model

        # Drop whatever gradients the previous iteration left behind.
        model.zero_grad()
        optimizer.zero_grad()

        # Back-propagate the scaled loss through the model.
        (runner.outputs['loss'] * scaler.loss_scale).backward()

        # Collect the optimizer's parameters and copy the model's gradients
        # onto them.
        fp32_weights = [
            param for group in optimizer.param_groups
            for param in group['params']
        ]
        self.copy_grads_to_fp32(model, fp32_weights)

        if self.distributed:
            allreduce_grads(fp32_weights, self.coalesce, self.bucket_size_mb)

        overflowed = scaler.has_overflow(fp32_weights)
        if not overflowed:
            # Remove the loss scaling from the gradients.
            for param in fp32_weights:
                if param.grad is not None:
                    param.grad.div_(scaler.loss_scale)

            if self.grad_clip is not None:
                total_norm = self.clip_grads(fp32_weights)
                if total_norm is not None:
                    # Record the gradient norm so the logger can report it.
                    runner.log_buffer.update(
                        {'grad_norm': float(total_norm)},
                        runner.outputs['num_samples'])

            # Update the fp32 weights, then mirror them into the model.
            optimizer.step()
            self.copy_params_to_fp16(model, fp32_weights)

        # On overflow the whole update above is skipped; only the scale is
        # adjusted.
        scaler.update_scale(overflowed)
        if overflowed:
            runner.logger.warning('Check overflow, downscale loss scale '
                                  f'to {scaler.cur_scale}')

        # Persist the scaler state so a resumed run can restore it.
        scaler_state = scaler.state_dict()
        runner.meta.setdefault('fp16', {})['loss_scaler'] = scaler_state
|
||
|
||
@HOOKS.register_module()
class GradientCumulativeFp16OptimizerHook(GradientCumulativeOptimizerHook,
                                          Fp16OptimizerHook):
    """Fp16 optimizer Hook (using mmcv implementation) implements multi-
    iters gradient cumulating."""

    def __init__(self, *args, **kwargs):
        super(GradientCumulativeFp16OptimizerHook,
              self).__init__(*args, **kwargs)

    def after_train_iter(self, runner):
        """Accumulate scaled fp16 gradients and update once per window.

        Every iteration divides the loss by the current window size,
        multiplies it by the loss scale and back-propagates. When
        ``every_n_iters`` or ``is_last_iter`` reports true, the gradients are
        copied to the optimizer's parameters, optionally all-reduced,
        checked for overflow, unscaled, clipped and applied; the loss scale
        is then updated, the scaler state saved and the gradients cleared.
        """
        if not self.initialized:
            self._init(runner)

        if runner.iter < self.divisible_iters:
            loss_factor = self.cumulative_iters
        else:
            loss_factor = self.remainder_iters

        loss = runner.outputs['loss']
        loss = loss / loss_factor

        # scale the loss value
        scaled_loss = loss * self.loss_scaler.loss_scale
        scaled_loss.backward()

        if (self.every_n_iters(runner, self.cumulative_iters)
                or self.is_last_iter(runner)):

            # copy fp16 grads in the model to fp32 params in the optimizer
            fp32_weights = []
            for param_group in runner.optimizer.param_groups:
                fp32_weights += param_group['params']
            self.copy_grads_to_fp32(runner.model, fp32_weights)
            # allreduce grads
            if self.distributed:
                allreduce_grads(fp32_weights, self.coalesce,
                                self.bucket_size_mb)

            has_overflow = self.loss_scaler.has_overflow(fp32_weights)
            # if has overflow, skip this iteration
            if not has_overflow:
                # scale the gradients back
                for param in fp32_weights:
                    if param.grad is not None:
                        param.grad.div_(self.loss_scaler.loss_scale)
                if self.grad_clip is not None:
                    grad_norm = self.clip_grads(fp32_weights)
                    if grad_norm is not None:
                        # Add grad norm to the logger
                        runner.log_buffer.update(
                            {'grad_norm': float(grad_norm)},
                            runner.outputs['num_samples'])
                # update fp32 params
                runner.optimizer.step()
                # copy fp32 params to the fp16 model
                self.copy_params_to_fp16(runner.model, fp32_weights)

            # Update the scale BEFORE logging, so the warning reports the
            # scale after the update rather than the one that overflowed.
            # This matches the order used by Fp16OptimizerHook.after_train_iter.
            self.loss_scaler.update_scale(has_overflow)
            if has_overflow:
                runner.logger.warning(
                    'Check overflow, downscale loss scale '
                    f'to {self.loss_scaler.cur_scale}')

            # save state_dict of loss_scaler
            runner.meta.setdefault(
                'fp16', {})['loss_scaler'] = self.loss_scaler.state_dict()

            # clear grads
            runner.model.zero_grad()
            runner.optimizer.zero_grad()
|