mirror of
https://github.com/comfyanonymous/ComfyUI.git
synced 2026-01-09 13:50:49 +08:00
Add ControlNet preprocessors
This commit is contained in:
parent
fa66ece26b
commit
96a0804a11
6
comfy/annotator/canny/__init__.py
Normal file
6
comfy/annotator/canny/__init__.py
Normal file
@ -0,0 +1,6 @@
|
||||
import cv2
|
||||
|
||||
|
||||
class CannyDetector:
|
||||
def __call__(self, img, low_threshold, high_threshold):
|
||||
return cv2.Canny(img, low_threshold, high_threshold)
|
||||
132
comfy/annotator/hed/__init__.py
Normal file
132
comfy/annotator/hed/__init__.py
Normal file
@ -0,0 +1,132 @@
|
||||
import numpy as np
|
||||
import cv2
|
||||
import os
|
||||
import torch
|
||||
from einops import rearrange
|
||||
from annotator.util import annotator_ckpts_path
|
||||
|
||||
|
||||
class Network(torch.nn.Module):
|
||||
def __init__(self, model_path):
|
||||
super().__init__()
|
||||
|
||||
self.netVggOne = torch.nn.Sequential(
|
||||
torch.nn.Conv2d(in_channels=3, out_channels=64, kernel_size=3, stride=1, padding=1),
|
||||
torch.nn.ReLU(inplace=False),
|
||||
torch.nn.Conv2d(in_channels=64, out_channels=64, kernel_size=3, stride=1, padding=1),
|
||||
torch.nn.ReLU(inplace=False)
|
||||
)
|
||||
|
||||
self.netVggTwo = torch.nn.Sequential(
|
||||
torch.nn.MaxPool2d(kernel_size=2, stride=2),
|
||||
torch.nn.Conv2d(in_channels=64, out_channels=128, kernel_size=3, stride=1, padding=1),
|
||||
torch.nn.ReLU(inplace=False),
|
||||
torch.nn.Conv2d(in_channels=128, out_channels=128, kernel_size=3, stride=1, padding=1),
|
||||
torch.nn.ReLU(inplace=False)
|
||||
)
|
||||
|
||||
self.netVggThr = torch.nn.Sequential(
|
||||
torch.nn.MaxPool2d(kernel_size=2, stride=2),
|
||||
torch.nn.Conv2d(in_channels=128, out_channels=256, kernel_size=3, stride=1, padding=1),
|
||||
torch.nn.ReLU(inplace=False),
|
||||
torch.nn.Conv2d(in_channels=256, out_channels=256, kernel_size=3, stride=1, padding=1),
|
||||
torch.nn.ReLU(inplace=False),
|
||||
torch.nn.Conv2d(in_channels=256, out_channels=256, kernel_size=3, stride=1, padding=1),
|
||||
torch.nn.ReLU(inplace=False)
|
||||
)
|
||||
|
||||
self.netVggFou = torch.nn.Sequential(
|
||||
torch.nn.MaxPool2d(kernel_size=2, stride=2),
|
||||
torch.nn.Conv2d(in_channels=256, out_channels=512, kernel_size=3, stride=1, padding=1),
|
||||
torch.nn.ReLU(inplace=False),
|
||||
torch.nn.Conv2d(in_channels=512, out_channels=512, kernel_size=3, stride=1, padding=1),
|
||||
torch.nn.ReLU(inplace=False),
|
||||
torch.nn.Conv2d(in_channels=512, out_channels=512, kernel_size=3, stride=1, padding=1),
|
||||
torch.nn.ReLU(inplace=False)
|
||||
)
|
||||
|
||||
self.netVggFiv = torch.nn.Sequential(
|
||||
torch.nn.MaxPool2d(kernel_size=2, stride=2),
|
||||
torch.nn.Conv2d(in_channels=512, out_channels=512, kernel_size=3, stride=1, padding=1),
|
||||
torch.nn.ReLU(inplace=False),
|
||||
torch.nn.Conv2d(in_channels=512, out_channels=512, kernel_size=3, stride=1, padding=1),
|
||||
torch.nn.ReLU(inplace=False),
|
||||
torch.nn.Conv2d(in_channels=512, out_channels=512, kernel_size=3, stride=1, padding=1),
|
||||
torch.nn.ReLU(inplace=False)
|
||||
)
|
||||
|
||||
self.netScoreOne = torch.nn.Conv2d(in_channels=64, out_channels=1, kernel_size=1, stride=1, padding=0)
|
||||
self.netScoreTwo = torch.nn.Conv2d(in_channels=128, out_channels=1, kernel_size=1, stride=1, padding=0)
|
||||
self.netScoreThr = torch.nn.Conv2d(in_channels=256, out_channels=1, kernel_size=1, stride=1, padding=0)
|
||||
self.netScoreFou = torch.nn.Conv2d(in_channels=512, out_channels=1, kernel_size=1, stride=1, padding=0)
|
||||
self.netScoreFiv = torch.nn.Conv2d(in_channels=512, out_channels=1, kernel_size=1, stride=1, padding=0)
|
||||
|
||||
self.netCombine = torch.nn.Sequential(
|
||||
torch.nn.Conv2d(in_channels=5, out_channels=1, kernel_size=1, stride=1, padding=0),
|
||||
torch.nn.Sigmoid()
|
||||
)
|
||||
|
||||
self.load_state_dict({strKey.replace('module', 'net'): tenWeight for strKey, tenWeight in torch.load(model_path).items()})
|
||||
|
||||
def forward(self, tenInput):
|
||||
tenInput = tenInput * 255.0
|
||||
tenInput = tenInput - torch.tensor(data=[104.00698793, 116.66876762, 122.67891434], dtype=tenInput.dtype, device=tenInput.device).view(1, 3, 1, 1)
|
||||
|
||||
tenVggOne = self.netVggOne(tenInput)
|
||||
tenVggTwo = self.netVggTwo(tenVggOne)
|
||||
tenVggThr = self.netVggThr(tenVggTwo)
|
||||
tenVggFou = self.netVggFou(tenVggThr)
|
||||
tenVggFiv = self.netVggFiv(tenVggFou)
|
||||
|
||||
tenScoreOne = self.netScoreOne(tenVggOne)
|
||||
tenScoreTwo = self.netScoreTwo(tenVggTwo)
|
||||
tenScoreThr = self.netScoreThr(tenVggThr)
|
||||
tenScoreFou = self.netScoreFou(tenVggFou)
|
||||
tenScoreFiv = self.netScoreFiv(tenVggFiv)
|
||||
|
||||
tenScoreOne = torch.nn.functional.interpolate(input=tenScoreOne, size=(tenInput.shape[2], tenInput.shape[3]), mode='bilinear', align_corners=False)
|
||||
tenScoreTwo = torch.nn.functional.interpolate(input=tenScoreTwo, size=(tenInput.shape[2], tenInput.shape[3]), mode='bilinear', align_corners=False)
|
||||
tenScoreThr = torch.nn.functional.interpolate(input=tenScoreThr, size=(tenInput.shape[2], tenInput.shape[3]), mode='bilinear', align_corners=False)
|
||||
tenScoreFou = torch.nn.functional.interpolate(input=tenScoreFou, size=(tenInput.shape[2], tenInput.shape[3]), mode='bilinear', align_corners=False)
|
||||
tenScoreFiv = torch.nn.functional.interpolate(input=tenScoreFiv, size=(tenInput.shape[2], tenInput.shape[3]), mode='bilinear', align_corners=False)
|
||||
|
||||
return self.netCombine(torch.cat([ tenScoreOne, tenScoreTwo, tenScoreThr, tenScoreFou, tenScoreFiv ], 1))
|
||||
|
||||
|
||||
class HEDdetector:
|
||||
def __init__(self):
|
||||
remote_model_path = "https://huggingface.co/lllyasviel/ControlNet/resolve/main/annotator/ckpts/network-bsds500.pth"
|
||||
modelpath = os.path.join(annotator_ckpts_path, "network-bsds500.pth")
|
||||
if not os.path.exists(modelpath):
|
||||
from basicsr.utils.download_util import load_file_from_url
|
||||
load_file_from_url(remote_model_path, model_dir=annotator_ckpts_path)
|
||||
self.netNetwork = Network(modelpath).cuda().eval()
|
||||
|
||||
def __call__(self, input_image):
|
||||
assert input_image.ndim == 3
|
||||
input_image = input_image[:, :, ::-1].copy()
|
||||
with torch.no_grad():
|
||||
image_hed = torch.from_numpy(input_image).float().cuda()
|
||||
image_hed = image_hed / 255.0
|
||||
image_hed = rearrange(image_hed, 'h w c -> 1 c h w')
|
||||
edge = self.netNetwork(image_hed)[0]
|
||||
edge = (edge.cpu().numpy() * 255.0).clip(0, 255).astype(np.uint8)
|
||||
return edge[0]
|
||||
|
||||
|
||||
def nms(x, t, s):
|
||||
x = cv2.GaussianBlur(x.astype(np.float32), (0, 0), s)
|
||||
|
||||
f1 = np.array([[0, 0, 0], [1, 1, 1], [0, 0, 0]], dtype=np.uint8)
|
||||
f2 = np.array([[0, 1, 0], [0, 1, 0], [0, 1, 0]], dtype=np.uint8)
|
||||
f3 = np.array([[1, 0, 0], [0, 1, 0], [0, 0, 1]], dtype=np.uint8)
|
||||
f4 = np.array([[0, 0, 1], [0, 1, 0], [1, 0, 0]], dtype=np.uint8)
|
||||
|
||||
y = np.zeros_like(x)
|
||||
|
||||
for f in [f1, f2, f3, f4]:
|
||||
np.putmask(y, cv2.dilate(x, kernel=f) == x, x)
|
||||
|
||||
z = np.zeros_like(y, dtype=np.uint8)
|
||||
z[y > t] = 255
|
||||
return z
|
||||
38
comfy/annotator/midas/__init__.py
Normal file
38
comfy/annotator/midas/__init__.py
Normal file
@ -0,0 +1,38 @@
|
||||
import cv2
|
||||
import numpy as np
|
||||
import torch
|
||||
|
||||
from einops import rearrange
|
||||
from .api import MiDaSInference
|
||||
|
||||
|
||||
class MidasDetector:
|
||||
def __init__(self):
|
||||
self.model = MiDaSInference(model_type="dpt_hybrid").cuda()
|
||||
|
||||
def __call__(self, input_image, a=np.pi * 2.0, bg_th=0.1):
|
||||
assert input_image.ndim == 3
|
||||
image_depth = input_image
|
||||
with torch.no_grad():
|
||||
image_depth = torch.from_numpy(image_depth).float().cuda()
|
||||
image_depth = image_depth / 127.5 - 1.0
|
||||
image_depth = rearrange(image_depth, 'h w c -> 1 c h w')
|
||||
depth = self.model(image_depth)[0]
|
||||
|
||||
depth_pt = depth.clone()
|
||||
depth_pt -= torch.min(depth_pt)
|
||||
depth_pt /= torch.max(depth_pt)
|
||||
depth_pt = depth_pt.cpu().numpy()
|
||||
depth_image = (depth_pt * 255.0).clip(0, 255).astype(np.uint8)
|
||||
|
||||
depth_np = depth.cpu().numpy()
|
||||
x = cv2.Sobel(depth_np, cv2.CV_32F, 1, 0, ksize=3)
|
||||
y = cv2.Sobel(depth_np, cv2.CV_32F, 0, 1, ksize=3)
|
||||
z = np.ones_like(x) * a
|
||||
x[depth_pt < bg_th] = 0
|
||||
y[depth_pt < bg_th] = 0
|
||||
normal = np.stack([x, y, z], axis=2)
|
||||
normal /= np.sum(normal ** 2.0, axis=2, keepdims=True) ** 0.5
|
||||
normal_image = (normal * 127.5 + 127.5).clip(0, 255).astype(np.uint8)
|
||||
|
||||
return depth_image, normal_image
|
||||
169
comfy/annotator/midas/api.py
Normal file
169
comfy/annotator/midas/api.py
Normal file
@ -0,0 +1,169 @@
|
||||
# based on https://github.com/isl-org/MiDaS
|
||||
|
||||
import cv2
|
||||
import os
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
from torchvision.transforms import Compose
|
||||
|
||||
from .midas.dpt_depth import DPTDepthModel
|
||||
from .midas.midas_net import MidasNet
|
||||
from .midas.midas_net_custom import MidasNet_small
|
||||
from .midas.transforms import Resize, NormalizeImage, PrepareForNet
|
||||
from annotator.util import annotator_ckpts_path
|
||||
|
||||
|
||||
ISL_PATHS = {
|
||||
"dpt_large": os.path.join(annotator_ckpts_path, "dpt_large-midas-2f21e586.pt"),
|
||||
"dpt_hybrid": os.path.join(annotator_ckpts_path, "dpt_hybrid-midas-501f0c75.pt"),
|
||||
"midas_v21": "",
|
||||
"midas_v21_small": "",
|
||||
}
|
||||
|
||||
remote_model_path = "https://huggingface.co/lllyasviel/ControlNet/resolve/main/annotator/ckpts/dpt_hybrid-midas-501f0c75.pt"
|
||||
|
||||
|
||||
def disabled_train(self, mode=True):
|
||||
"""Overwrite model.train with this function to make sure train/eval mode
|
||||
does not change anymore."""
|
||||
return self
|
||||
|
||||
|
||||
def load_midas_transform(model_type):
|
||||
# https://github.com/isl-org/MiDaS/blob/master/run.py
|
||||
# load transform only
|
||||
if model_type == "dpt_large": # DPT-Large
|
||||
net_w, net_h = 384, 384
|
||||
resize_mode = "minimal"
|
||||
normalization = NormalizeImage(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])
|
||||
|
||||
elif model_type == "dpt_hybrid": # DPT-Hybrid
|
||||
net_w, net_h = 384, 384
|
||||
resize_mode = "minimal"
|
||||
normalization = NormalizeImage(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])
|
||||
|
||||
elif model_type == "midas_v21":
|
||||
net_w, net_h = 384, 384
|
||||
resize_mode = "upper_bound"
|
||||
normalization = NormalizeImage(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
|
||||
|
||||
elif model_type == "midas_v21_small":
|
||||
net_w, net_h = 256, 256
|
||||
resize_mode = "upper_bound"
|
||||
normalization = NormalizeImage(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
|
||||
|
||||
else:
|
||||
assert False, f"model_type '{model_type}' not implemented, use: --model_type large"
|
||||
|
||||
transform = Compose(
|
||||
[
|
||||
Resize(
|
||||
net_w,
|
||||
net_h,
|
||||
resize_target=None,
|
||||
keep_aspect_ratio=True,
|
||||
ensure_multiple_of=32,
|
||||
resize_method=resize_mode,
|
||||
image_interpolation_method=cv2.INTER_CUBIC,
|
||||
),
|
||||
normalization,
|
||||
PrepareForNet(),
|
||||
]
|
||||
)
|
||||
|
||||
return transform
|
||||
|
||||
|
||||
def load_model(model_type):
|
||||
# https://github.com/isl-org/MiDaS/blob/master/run.py
|
||||
# load network
|
||||
model_path = ISL_PATHS[model_type]
|
||||
if model_type == "dpt_large": # DPT-Large
|
||||
model = DPTDepthModel(
|
||||
path=model_path,
|
||||
backbone="vitl16_384",
|
||||
non_negative=True,
|
||||
)
|
||||
net_w, net_h = 384, 384
|
||||
resize_mode = "minimal"
|
||||
normalization = NormalizeImage(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])
|
||||
|
||||
elif model_type == "dpt_hybrid": # DPT-Hybrid
|
||||
if not os.path.exists(model_path):
|
||||
from basicsr.utils.download_util import load_file_from_url
|
||||
load_file_from_url(remote_model_path, model_dir=annotator_ckpts_path)
|
||||
|
||||
model = DPTDepthModel(
|
||||
path=model_path,
|
||||
backbone="vitb_rn50_384",
|
||||
non_negative=True,
|
||||
)
|
||||
net_w, net_h = 384, 384
|
||||
resize_mode = "minimal"
|
||||
normalization = NormalizeImage(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])
|
||||
|
||||
elif model_type == "midas_v21":
|
||||
model = MidasNet(model_path, non_negative=True)
|
||||
net_w, net_h = 384, 384
|
||||
resize_mode = "upper_bound"
|
||||
normalization = NormalizeImage(
|
||||
mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]
|
||||
)
|
||||
|
||||
elif model_type == "midas_v21_small":
|
||||
model = MidasNet_small(model_path, features=64, backbone="efficientnet_lite3", exportable=True,
|
||||
non_negative=True, blocks={'expand': True})
|
||||
net_w, net_h = 256, 256
|
||||
resize_mode = "upper_bound"
|
||||
normalization = NormalizeImage(
|
||||
mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]
|
||||
)
|
||||
|
||||
else:
|
||||
print(f"model_type '{model_type}' not implemented, use: --model_type large")
|
||||
assert False
|
||||
|
||||
transform = Compose(
|
||||
[
|
||||
Resize(
|
||||
net_w,
|
||||
net_h,
|
||||
resize_target=None,
|
||||
keep_aspect_ratio=True,
|
||||
ensure_multiple_of=32,
|
||||
resize_method=resize_mode,
|
||||
image_interpolation_method=cv2.INTER_CUBIC,
|
||||
),
|
||||
normalization,
|
||||
PrepareForNet(),
|
||||
]
|
||||
)
|
||||
|
||||
return model.eval(), transform
|
||||
|
||||
|
||||
class MiDaSInference(nn.Module):
|
||||
MODEL_TYPES_TORCH_HUB = [
|
||||
"DPT_Large",
|
||||
"DPT_Hybrid",
|
||||
"MiDaS_small"
|
||||
]
|
||||
MODEL_TYPES_ISL = [
|
||||
"dpt_large",
|
||||
"dpt_hybrid",
|
||||
"midas_v21",
|
||||
"midas_v21_small",
|
||||
]
|
||||
|
||||
def __init__(self, model_type):
|
||||
super().__init__()
|
||||
assert (model_type in self.MODEL_TYPES_ISL)
|
||||
model, _ = load_model(model_type)
|
||||
self.model = model
|
||||
self.model.train = disabled_train
|
||||
|
||||
def forward(self, x):
|
||||
with torch.no_grad():
|
||||
prediction = self.model(x)
|
||||
return prediction
|
||||
|
||||
0
comfy/annotator/midas/midas/__init__.py
Normal file
0
comfy/annotator/midas/midas/__init__.py
Normal file
16
comfy/annotator/midas/midas/base_model.py
Normal file
16
comfy/annotator/midas/midas/base_model.py
Normal file
@ -0,0 +1,16 @@
|
||||
import torch
|
||||
|
||||
|
||||
class BaseModel(torch.nn.Module):
|
||||
def load(self, path):
|
||||
"""Load model from file.
|
||||
|
||||
Args:
|
||||
path (str): file path
|
||||
"""
|
||||
parameters = torch.load(path, map_location=torch.device('cpu'))
|
||||
|
||||
if "optimizer" in parameters:
|
||||
parameters = parameters["model"]
|
||||
|
||||
self.load_state_dict(parameters)
|
||||
342
comfy/annotator/midas/midas/blocks.py
Normal file
342
comfy/annotator/midas/midas/blocks.py
Normal file
@ -0,0 +1,342 @@
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
|
||||
from .vit import (
|
||||
_make_pretrained_vitb_rn50_384,
|
||||
_make_pretrained_vitl16_384,
|
||||
_make_pretrained_vitb16_384,
|
||||
forward_vit,
|
||||
)
|
||||
|
||||
def _make_encoder(backbone, features, use_pretrained, groups=1, expand=False, exportable=True, hooks=None, use_vit_only=False, use_readout="ignore",):
|
||||
if backbone == "vitl16_384":
|
||||
pretrained = _make_pretrained_vitl16_384(
|
||||
use_pretrained, hooks=hooks, use_readout=use_readout
|
||||
)
|
||||
scratch = _make_scratch(
|
||||
[256, 512, 1024, 1024], features, groups=groups, expand=expand
|
||||
) # ViT-L/16 - 85.0% Top1 (backbone)
|
||||
elif backbone == "vitb_rn50_384":
|
||||
pretrained = _make_pretrained_vitb_rn50_384(
|
||||
use_pretrained,
|
||||
hooks=hooks,
|
||||
use_vit_only=use_vit_only,
|
||||
use_readout=use_readout,
|
||||
)
|
||||
scratch = _make_scratch(
|
||||
[256, 512, 768, 768], features, groups=groups, expand=expand
|
||||
) # ViT-H/16 - 85.0% Top1 (backbone)
|
||||
elif backbone == "vitb16_384":
|
||||
pretrained = _make_pretrained_vitb16_384(
|
||||
use_pretrained, hooks=hooks, use_readout=use_readout
|
||||
)
|
||||
scratch = _make_scratch(
|
||||
[96, 192, 384, 768], features, groups=groups, expand=expand
|
||||
) # ViT-B/16 - 84.6% Top1 (backbone)
|
||||
elif backbone == "resnext101_wsl":
|
||||
pretrained = _make_pretrained_resnext101_wsl(use_pretrained)
|
||||
scratch = _make_scratch([256, 512, 1024, 2048], features, groups=groups, expand=expand) # efficientnet_lite3
|
||||
elif backbone == "efficientnet_lite3":
|
||||
pretrained = _make_pretrained_efficientnet_lite3(use_pretrained, exportable=exportable)
|
||||
scratch = _make_scratch([32, 48, 136, 384], features, groups=groups, expand=expand) # efficientnet_lite3
|
||||
else:
|
||||
print(f"Backbone '{backbone}' not implemented")
|
||||
assert False
|
||||
|
||||
return pretrained, scratch
|
||||
|
||||
|
||||
def _make_scratch(in_shape, out_shape, groups=1, expand=False):
|
||||
scratch = nn.Module()
|
||||
|
||||
out_shape1 = out_shape
|
||||
out_shape2 = out_shape
|
||||
out_shape3 = out_shape
|
||||
out_shape4 = out_shape
|
||||
if expand==True:
|
||||
out_shape1 = out_shape
|
||||
out_shape2 = out_shape*2
|
||||
out_shape3 = out_shape*4
|
||||
out_shape4 = out_shape*8
|
||||
|
||||
scratch.layer1_rn = nn.Conv2d(
|
||||
in_shape[0], out_shape1, kernel_size=3, stride=1, padding=1, bias=False, groups=groups
|
||||
)
|
||||
scratch.layer2_rn = nn.Conv2d(
|
||||
in_shape[1], out_shape2, kernel_size=3, stride=1, padding=1, bias=False, groups=groups
|
||||
)
|
||||
scratch.layer3_rn = nn.Conv2d(
|
||||
in_shape[2], out_shape3, kernel_size=3, stride=1, padding=1, bias=False, groups=groups
|
||||
)
|
||||
scratch.layer4_rn = nn.Conv2d(
|
||||
in_shape[3], out_shape4, kernel_size=3, stride=1, padding=1, bias=False, groups=groups
|
||||
)
|
||||
|
||||
return scratch
|
||||
|
||||
|
||||
def _make_pretrained_efficientnet_lite3(use_pretrained, exportable=False):
|
||||
efficientnet = torch.hub.load(
|
||||
"rwightman/gen-efficientnet-pytorch",
|
||||
"tf_efficientnet_lite3",
|
||||
pretrained=use_pretrained,
|
||||
exportable=exportable
|
||||
)
|
||||
return _make_efficientnet_backbone(efficientnet)
|
||||
|
||||
|
||||
def _make_efficientnet_backbone(effnet):
|
||||
pretrained = nn.Module()
|
||||
|
||||
pretrained.layer1 = nn.Sequential(
|
||||
effnet.conv_stem, effnet.bn1, effnet.act1, *effnet.blocks[0:2]
|
||||
)
|
||||
pretrained.layer2 = nn.Sequential(*effnet.blocks[2:3])
|
||||
pretrained.layer3 = nn.Sequential(*effnet.blocks[3:5])
|
||||
pretrained.layer4 = nn.Sequential(*effnet.blocks[5:9])
|
||||
|
||||
return pretrained
|
||||
|
||||
|
||||
def _make_resnet_backbone(resnet):
|
||||
pretrained = nn.Module()
|
||||
pretrained.layer1 = nn.Sequential(
|
||||
resnet.conv1, resnet.bn1, resnet.relu, resnet.maxpool, resnet.layer1
|
||||
)
|
||||
|
||||
pretrained.layer2 = resnet.layer2
|
||||
pretrained.layer3 = resnet.layer3
|
||||
pretrained.layer4 = resnet.layer4
|
||||
|
||||
return pretrained
|
||||
|
||||
|
||||
def _make_pretrained_resnext101_wsl(use_pretrained):
|
||||
resnet = torch.hub.load("facebookresearch/WSL-Images", "resnext101_32x8d_wsl")
|
||||
return _make_resnet_backbone(resnet)
|
||||
|
||||
|
||||
|
||||
class Interpolate(nn.Module):
|
||||
"""Interpolation module.
|
||||
"""
|
||||
|
||||
def __init__(self, scale_factor, mode, align_corners=False):
|
||||
"""Init.
|
||||
|
||||
Args:
|
||||
scale_factor (float): scaling
|
||||
mode (str): interpolation mode
|
||||
"""
|
||||
super(Interpolate, self).__init__()
|
||||
|
||||
self.interp = nn.functional.interpolate
|
||||
self.scale_factor = scale_factor
|
||||
self.mode = mode
|
||||
self.align_corners = align_corners
|
||||
|
||||
def forward(self, x):
|
||||
"""Forward pass.
|
||||
|
||||
Args:
|
||||
x (tensor): input
|
||||
|
||||
Returns:
|
||||
tensor: interpolated data
|
||||
"""
|
||||
|
||||
x = self.interp(
|
||||
x, scale_factor=self.scale_factor, mode=self.mode, align_corners=self.align_corners
|
||||
)
|
||||
|
||||
return x
|
||||
|
||||
|
||||
class ResidualConvUnit(nn.Module):
|
||||
"""Residual convolution module.
|
||||
"""
|
||||
|
||||
def __init__(self, features):
|
||||
"""Init.
|
||||
|
||||
Args:
|
||||
features (int): number of features
|
||||
"""
|
||||
super().__init__()
|
||||
|
||||
self.conv1 = nn.Conv2d(
|
||||
features, features, kernel_size=3, stride=1, padding=1, bias=True
|
||||
)
|
||||
|
||||
self.conv2 = nn.Conv2d(
|
||||
features, features, kernel_size=3, stride=1, padding=1, bias=True
|
||||
)
|
||||
|
||||
self.relu = nn.ReLU(inplace=True)
|
||||
|
||||
def forward(self, x):
|
||||
"""Forward pass.
|
||||
|
||||
Args:
|
||||
x (tensor): input
|
||||
|
||||
Returns:
|
||||
tensor: output
|
||||
"""
|
||||
out = self.relu(x)
|
||||
out = self.conv1(out)
|
||||
out = self.relu(out)
|
||||
out = self.conv2(out)
|
||||
|
||||
return out + x
|
||||
|
||||
|
||||
class FeatureFusionBlock(nn.Module):
|
||||
"""Feature fusion block.
|
||||
"""
|
||||
|
||||
def __init__(self, features):
|
||||
"""Init.
|
||||
|
||||
Args:
|
||||
features (int): number of features
|
||||
"""
|
||||
super(FeatureFusionBlock, self).__init__()
|
||||
|
||||
self.resConfUnit1 = ResidualConvUnit(features)
|
||||
self.resConfUnit2 = ResidualConvUnit(features)
|
||||
|
||||
def forward(self, *xs):
|
||||
"""Forward pass.
|
||||
|
||||
Returns:
|
||||
tensor: output
|
||||
"""
|
||||
output = xs[0]
|
||||
|
||||
if len(xs) == 2:
|
||||
output += self.resConfUnit1(xs[1])
|
||||
|
||||
output = self.resConfUnit2(output)
|
||||
|
||||
output = nn.functional.interpolate(
|
||||
output, scale_factor=2, mode="bilinear", align_corners=True
|
||||
)
|
||||
|
||||
return output
|
||||
|
||||
|
||||
|
||||
|
||||
class ResidualConvUnit_custom(nn.Module):
|
||||
"""Residual convolution module.
|
||||
"""
|
||||
|
||||
def __init__(self, features, activation, bn):
|
||||
"""Init.
|
||||
|
||||
Args:
|
||||
features (int): number of features
|
||||
"""
|
||||
super().__init__()
|
||||
|
||||
self.bn = bn
|
||||
|
||||
self.groups=1
|
||||
|
||||
self.conv1 = nn.Conv2d(
|
||||
features, features, kernel_size=3, stride=1, padding=1, bias=True, groups=self.groups
|
||||
)
|
||||
|
||||
self.conv2 = nn.Conv2d(
|
||||
features, features, kernel_size=3, stride=1, padding=1, bias=True, groups=self.groups
|
||||
)
|
||||
|
||||
if self.bn==True:
|
||||
self.bn1 = nn.BatchNorm2d(features)
|
||||
self.bn2 = nn.BatchNorm2d(features)
|
||||
|
||||
self.activation = activation
|
||||
|
||||
self.skip_add = nn.quantized.FloatFunctional()
|
||||
|
||||
def forward(self, x):
|
||||
"""Forward pass.
|
||||
|
||||
Args:
|
||||
x (tensor): input
|
||||
|
||||
Returns:
|
||||
tensor: output
|
||||
"""
|
||||
|
||||
out = self.activation(x)
|
||||
out = self.conv1(out)
|
||||
if self.bn==True:
|
||||
out = self.bn1(out)
|
||||
|
||||
out = self.activation(out)
|
||||
out = self.conv2(out)
|
||||
if self.bn==True:
|
||||
out = self.bn2(out)
|
||||
|
||||
if self.groups > 1:
|
||||
out = self.conv_merge(out)
|
||||
|
||||
return self.skip_add.add(out, x)
|
||||
|
||||
# return out + x
|
||||
|
||||
|
||||
class FeatureFusionBlock_custom(nn.Module):
|
||||
"""Feature fusion block.
|
||||
"""
|
||||
|
||||
def __init__(self, features, activation, deconv=False, bn=False, expand=False, align_corners=True):
|
||||
"""Init.
|
||||
|
||||
Args:
|
||||
features (int): number of features
|
||||
"""
|
||||
super(FeatureFusionBlock_custom, self).__init__()
|
||||
|
||||
self.deconv = deconv
|
||||
self.align_corners = align_corners
|
||||
|
||||
self.groups=1
|
||||
|
||||
self.expand = expand
|
||||
out_features = features
|
||||
if self.expand==True:
|
||||
out_features = features//2
|
||||
|
||||
self.out_conv = nn.Conv2d(features, out_features, kernel_size=1, stride=1, padding=0, bias=True, groups=1)
|
||||
|
||||
self.resConfUnit1 = ResidualConvUnit_custom(features, activation, bn)
|
||||
self.resConfUnit2 = ResidualConvUnit_custom(features, activation, bn)
|
||||
|
||||
self.skip_add = nn.quantized.FloatFunctional()
|
||||
|
||||
def forward(self, *xs):
|
||||
"""Forward pass.
|
||||
|
||||
Returns:
|
||||
tensor: output
|
||||
"""
|
||||
output = xs[0]
|
||||
|
||||
if len(xs) == 2:
|
||||
res = self.resConfUnit1(xs[1])
|
||||
output = self.skip_add.add(output, res)
|
||||
# output += res
|
||||
|
||||
output = self.resConfUnit2(output)
|
||||
|
||||
output = nn.functional.interpolate(
|
||||
output, scale_factor=2, mode="bilinear", align_corners=self.align_corners
|
||||
)
|
||||
|
||||
output = self.out_conv(output)
|
||||
|
||||
return output
|
||||
|
||||
109
comfy/annotator/midas/midas/dpt_depth.py
Normal file
109
comfy/annotator/midas/midas/dpt_depth.py
Normal file
@ -0,0 +1,109 @@
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
import torch.nn.functional as F
|
||||
|
||||
from .base_model import BaseModel
|
||||
from .blocks import (
|
||||
FeatureFusionBlock,
|
||||
FeatureFusionBlock_custom,
|
||||
Interpolate,
|
||||
_make_encoder,
|
||||
forward_vit,
|
||||
)
|
||||
|
||||
|
||||
def _make_fusion_block(features, use_bn):
|
||||
return FeatureFusionBlock_custom(
|
||||
features,
|
||||
nn.ReLU(False),
|
||||
deconv=False,
|
||||
bn=use_bn,
|
||||
expand=False,
|
||||
align_corners=True,
|
||||
)
|
||||
|
||||
|
||||
class DPT(BaseModel):
|
||||
def __init__(
|
||||
self,
|
||||
head,
|
||||
features=256,
|
||||
backbone="vitb_rn50_384",
|
||||
readout="project",
|
||||
channels_last=False,
|
||||
use_bn=False,
|
||||
):
|
||||
|
||||
super(DPT, self).__init__()
|
||||
|
||||
self.channels_last = channels_last
|
||||
|
||||
hooks = {
|
||||
"vitb_rn50_384": [0, 1, 8, 11],
|
||||
"vitb16_384": [2, 5, 8, 11],
|
||||
"vitl16_384": [5, 11, 17, 23],
|
||||
}
|
||||
|
||||
# Instantiate backbone and reassemble blocks
|
||||
self.pretrained, self.scratch = _make_encoder(
|
||||
backbone,
|
||||
features,
|
||||
False, # Set to true of you want to train from scratch, uses ImageNet weights
|
||||
groups=1,
|
||||
expand=False,
|
||||
exportable=False,
|
||||
hooks=hooks[backbone],
|
||||
use_readout=readout,
|
||||
)
|
||||
|
||||
self.scratch.refinenet1 = _make_fusion_block(features, use_bn)
|
||||
self.scratch.refinenet2 = _make_fusion_block(features, use_bn)
|
||||
self.scratch.refinenet3 = _make_fusion_block(features, use_bn)
|
||||
self.scratch.refinenet4 = _make_fusion_block(features, use_bn)
|
||||
|
||||
self.scratch.output_conv = head
|
||||
|
||||
|
||||
def forward(self, x):
|
||||
if self.channels_last == True:
|
||||
x.contiguous(memory_format=torch.channels_last)
|
||||
|
||||
layer_1, layer_2, layer_3, layer_4 = forward_vit(self.pretrained, x)
|
||||
|
||||
layer_1_rn = self.scratch.layer1_rn(layer_1)
|
||||
layer_2_rn = self.scratch.layer2_rn(layer_2)
|
||||
layer_3_rn = self.scratch.layer3_rn(layer_3)
|
||||
layer_4_rn = self.scratch.layer4_rn(layer_4)
|
||||
|
||||
path_4 = self.scratch.refinenet4(layer_4_rn)
|
||||
path_3 = self.scratch.refinenet3(path_4, layer_3_rn)
|
||||
path_2 = self.scratch.refinenet2(path_3, layer_2_rn)
|
||||
path_1 = self.scratch.refinenet1(path_2, layer_1_rn)
|
||||
|
||||
out = self.scratch.output_conv(path_1)
|
||||
|
||||
return out
|
||||
|
||||
|
||||
class DPTDepthModel(DPT):
|
||||
def __init__(self, path=None, non_negative=True, **kwargs):
|
||||
features = kwargs["features"] if "features" in kwargs else 256
|
||||
|
||||
head = nn.Sequential(
|
||||
nn.Conv2d(features, features // 2, kernel_size=3, stride=1, padding=1),
|
||||
Interpolate(scale_factor=2, mode="bilinear", align_corners=True),
|
||||
nn.Conv2d(features // 2, 32, kernel_size=3, stride=1, padding=1),
|
||||
nn.ReLU(True),
|
||||
nn.Conv2d(32, 1, kernel_size=1, stride=1, padding=0),
|
||||
nn.ReLU(True) if non_negative else nn.Identity(),
|
||||
nn.Identity(),
|
||||
)
|
||||
|
||||
super().__init__(head, **kwargs)
|
||||
|
||||
if path is not None:
|
||||
self.load(path)
|
||||
|
||||
def forward(self, x):
|
||||
return super().forward(x).squeeze(dim=1)
|
||||
|
||||
76
comfy/annotator/midas/midas/midas_net.py
Normal file
76
comfy/annotator/midas/midas/midas_net.py
Normal file
@ -0,0 +1,76 @@
|
||||
"""MidashNet: Network for monocular depth estimation trained by mixing several datasets.
|
||||
This file contains code that is adapted from
|
||||
https://github.com/thomasjpfan/pytorch_refinenet/blob/master/pytorch_refinenet/refinenet/refinenet_4cascade.py
|
||||
"""
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
|
||||
from .base_model import BaseModel
|
||||
from .blocks import FeatureFusionBlock, Interpolate, _make_encoder
|
||||
|
||||
|
||||
class MidasNet(BaseModel):
|
||||
"""Network for monocular depth estimation.
|
||||
"""
|
||||
|
||||
def __init__(self, path=None, features=256, non_negative=True):
|
||||
"""Init.
|
||||
|
||||
Args:
|
||||
path (str, optional): Path to saved model. Defaults to None.
|
||||
features (int, optional): Number of features. Defaults to 256.
|
||||
backbone (str, optional): Backbone network for encoder. Defaults to resnet50
|
||||
"""
|
||||
print("Loading weights: ", path)
|
||||
|
||||
super(MidasNet, self).__init__()
|
||||
|
||||
use_pretrained = False if path is None else True
|
||||
|
||||
self.pretrained, self.scratch = _make_encoder(backbone="resnext101_wsl", features=features, use_pretrained=use_pretrained)
|
||||
|
||||
self.scratch.refinenet4 = FeatureFusionBlock(features)
|
||||
self.scratch.refinenet3 = FeatureFusionBlock(features)
|
||||
self.scratch.refinenet2 = FeatureFusionBlock(features)
|
||||
self.scratch.refinenet1 = FeatureFusionBlock(features)
|
||||
|
||||
self.scratch.output_conv = nn.Sequential(
|
||||
nn.Conv2d(features, 128, kernel_size=3, stride=1, padding=1),
|
||||
Interpolate(scale_factor=2, mode="bilinear"),
|
||||
nn.Conv2d(128, 32, kernel_size=3, stride=1, padding=1),
|
||||
nn.ReLU(True),
|
||||
nn.Conv2d(32, 1, kernel_size=1, stride=1, padding=0),
|
||||
nn.ReLU(True) if non_negative else nn.Identity(),
|
||||
)
|
||||
|
||||
if path:
|
||||
self.load(path)
|
||||
|
||||
def forward(self, x):
|
||||
"""Forward pass.
|
||||
|
||||
Args:
|
||||
x (tensor): input data (image)
|
||||
|
||||
Returns:
|
||||
tensor: depth
|
||||
"""
|
||||
|
||||
layer_1 = self.pretrained.layer1(x)
|
||||
layer_2 = self.pretrained.layer2(layer_1)
|
||||
layer_3 = self.pretrained.layer3(layer_2)
|
||||
layer_4 = self.pretrained.layer4(layer_3)
|
||||
|
||||
layer_1_rn = self.scratch.layer1_rn(layer_1)
|
||||
layer_2_rn = self.scratch.layer2_rn(layer_2)
|
||||
layer_3_rn = self.scratch.layer3_rn(layer_3)
|
||||
layer_4_rn = self.scratch.layer4_rn(layer_4)
|
||||
|
||||
path_4 = self.scratch.refinenet4(layer_4_rn)
|
||||
path_3 = self.scratch.refinenet3(path_4, layer_3_rn)
|
||||
path_2 = self.scratch.refinenet2(path_3, layer_2_rn)
|
||||
path_1 = self.scratch.refinenet1(path_2, layer_1_rn)
|
||||
|
||||
out = self.scratch.output_conv(path_1)
|
||||
|
||||
return torch.squeeze(out, dim=1)
|
||||
128
comfy/annotator/midas/midas/midas_net_custom.py
Normal file
128
comfy/annotator/midas/midas/midas_net_custom.py
Normal file
@ -0,0 +1,128 @@
|
||||
"""MidashNet: Network for monocular depth estimation trained by mixing several datasets.
|
||||
This file contains code that is adapted from
|
||||
https://github.com/thomasjpfan/pytorch_refinenet/blob/master/pytorch_refinenet/refinenet/refinenet_4cascade.py
|
||||
"""
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
|
||||
from .base_model import BaseModel
|
||||
from .blocks import FeatureFusionBlock, FeatureFusionBlock_custom, Interpolate, _make_encoder
|
||||
|
||||
|
||||
class MidasNet_small(BaseModel):
|
||||
"""Network for monocular depth estimation.
|
||||
"""
|
||||
|
||||
def __init__(self, path=None, features=64, backbone="efficientnet_lite3", non_negative=True, exportable=True, channels_last=False, align_corners=True,
|
||||
blocks={'expand': True}):
|
||||
"""Init.
|
||||
|
||||
Args:
|
||||
path (str, optional): Path to saved model. Defaults to None.
|
||||
features (int, optional): Number of features. Defaults to 256.
|
||||
backbone (str, optional): Backbone network for encoder. Defaults to resnet50
|
||||
"""
|
||||
print("Loading weights: ", path)
|
||||
|
||||
super(MidasNet_small, self).__init__()
|
||||
|
||||
use_pretrained = False if path else True
|
||||
|
||||
self.channels_last = channels_last
|
||||
self.blocks = blocks
|
||||
self.backbone = backbone
|
||||
|
||||
self.groups = 1
|
||||
|
||||
features1=features
|
||||
features2=features
|
||||
features3=features
|
||||
features4=features
|
||||
self.expand = False
|
||||
if "expand" in self.blocks and self.blocks['expand'] == True:
|
||||
self.expand = True
|
||||
features1=features
|
||||
features2=features*2
|
||||
features3=features*4
|
||||
features4=features*8
|
||||
|
||||
self.pretrained, self.scratch = _make_encoder(self.backbone, features, use_pretrained, groups=self.groups, expand=self.expand, exportable=exportable)
|
||||
|
||||
self.scratch.activation = nn.ReLU(False)
|
||||
|
||||
self.scratch.refinenet4 = FeatureFusionBlock_custom(features4, self.scratch.activation, deconv=False, bn=False, expand=self.expand, align_corners=align_corners)
|
||||
self.scratch.refinenet3 = FeatureFusionBlock_custom(features3, self.scratch.activation, deconv=False, bn=False, expand=self.expand, align_corners=align_corners)
|
||||
self.scratch.refinenet2 = FeatureFusionBlock_custom(features2, self.scratch.activation, deconv=False, bn=False, expand=self.expand, align_corners=align_corners)
|
||||
self.scratch.refinenet1 = FeatureFusionBlock_custom(features1, self.scratch.activation, deconv=False, bn=False, align_corners=align_corners)
|
||||
|
||||
|
||||
self.scratch.output_conv = nn.Sequential(
|
||||
nn.Conv2d(features, features//2, kernel_size=3, stride=1, padding=1, groups=self.groups),
|
||||
Interpolate(scale_factor=2, mode="bilinear"),
|
||||
nn.Conv2d(features//2, 32, kernel_size=3, stride=1, padding=1),
|
||||
self.scratch.activation,
|
||||
nn.Conv2d(32, 1, kernel_size=1, stride=1, padding=0),
|
||||
nn.ReLU(True) if non_negative else nn.Identity(),
|
||||
nn.Identity(),
|
||||
)
|
||||
|
||||
if path:
|
||||
self.load(path)
|
||||
|
||||
|
||||
def forward(self, x):
|
||||
"""Forward pass.
|
||||
|
||||
Args:
|
||||
x (tensor): input data (image)
|
||||
|
||||
Returns:
|
||||
tensor: depth
|
||||
"""
|
||||
if self.channels_last==True:
|
||||
print("self.channels_last = ", self.channels_last)
|
||||
x.contiguous(memory_format=torch.channels_last)
|
||||
|
||||
|
||||
layer_1 = self.pretrained.layer1(x)
|
||||
layer_2 = self.pretrained.layer2(layer_1)
|
||||
layer_3 = self.pretrained.layer3(layer_2)
|
||||
layer_4 = self.pretrained.layer4(layer_3)
|
||||
|
||||
layer_1_rn = self.scratch.layer1_rn(layer_1)
|
||||
layer_2_rn = self.scratch.layer2_rn(layer_2)
|
||||
layer_3_rn = self.scratch.layer3_rn(layer_3)
|
||||
layer_4_rn = self.scratch.layer4_rn(layer_4)
|
||||
|
||||
|
||||
path_4 = self.scratch.refinenet4(layer_4_rn)
|
||||
path_3 = self.scratch.refinenet3(path_4, layer_3_rn)
|
||||
path_2 = self.scratch.refinenet2(path_3, layer_2_rn)
|
||||
path_1 = self.scratch.refinenet1(path_2, layer_1_rn)
|
||||
|
||||
out = self.scratch.output_conv(path_1)
|
||||
|
||||
return torch.squeeze(out, dim=1)
|
||||
|
||||
|
||||
|
||||
def fuse_model(m):
|
||||
prev_previous_type = nn.Identity()
|
||||
prev_previous_name = ''
|
||||
previous_type = nn.Identity()
|
||||
previous_name = ''
|
||||
for name, module in m.named_modules():
|
||||
if prev_previous_type == nn.Conv2d and previous_type == nn.BatchNorm2d and type(module) == nn.ReLU:
|
||||
# print("FUSED ", prev_previous_name, previous_name, name)
|
||||
torch.quantization.fuse_modules(m, [prev_previous_name, previous_name, name], inplace=True)
|
||||
elif prev_previous_type == nn.Conv2d and previous_type == nn.BatchNorm2d:
|
||||
# print("FUSED ", prev_previous_name, previous_name)
|
||||
torch.quantization.fuse_modules(m, [prev_previous_name, previous_name], inplace=True)
|
||||
# elif previous_type == nn.Conv2d and type(module) == nn.ReLU:
|
||||
# print("FUSED ", previous_name, name)
|
||||
# torch.quantization.fuse_modules(m, [previous_name, name], inplace=True)
|
||||
|
||||
prev_previous_type = previous_type
|
||||
prev_previous_name = previous_name
|
||||
previous_type = type(module)
|
||||
previous_name = name
|
||||
234
comfy/annotator/midas/midas/transforms.py
Normal file
234
comfy/annotator/midas/midas/transforms.py
Normal file
@ -0,0 +1,234 @@
|
||||
import numpy as np
|
||||
import cv2
|
||||
import math
|
||||
|
||||
|
||||
def apply_min_size(sample, size, image_interpolation_method=cv2.INTER_AREA):
|
||||
"""Rezise the sample to ensure the given size. Keeps aspect ratio.
|
||||
|
||||
Args:
|
||||
sample (dict): sample
|
||||
size (tuple): image size
|
||||
|
||||
Returns:
|
||||
tuple: new size
|
||||
"""
|
||||
shape = list(sample["disparity"].shape)
|
||||
|
||||
if shape[0] >= size[0] and shape[1] >= size[1]:
|
||||
return sample
|
||||
|
||||
scale = [0, 0]
|
||||
scale[0] = size[0] / shape[0]
|
||||
scale[1] = size[1] / shape[1]
|
||||
|
||||
scale = max(scale)
|
||||
|
||||
shape[0] = math.ceil(scale * shape[0])
|
||||
shape[1] = math.ceil(scale * shape[1])
|
||||
|
||||
# resize
|
||||
sample["image"] = cv2.resize(
|
||||
sample["image"], tuple(shape[::-1]), interpolation=image_interpolation_method
|
||||
)
|
||||
|
||||
sample["disparity"] = cv2.resize(
|
||||
sample["disparity"], tuple(shape[::-1]), interpolation=cv2.INTER_NEAREST
|
||||
)
|
||||
sample["mask"] = cv2.resize(
|
||||
sample["mask"].astype(np.float32),
|
||||
tuple(shape[::-1]),
|
||||
interpolation=cv2.INTER_NEAREST,
|
||||
)
|
||||
sample["mask"] = sample["mask"].astype(bool)
|
||||
|
||||
return tuple(shape)
|
||||
|
||||
|
||||
class Resize(object):
|
||||
"""Resize sample to given size (width, height).
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
width,
|
||||
height,
|
||||
resize_target=True,
|
||||
keep_aspect_ratio=False,
|
||||
ensure_multiple_of=1,
|
||||
resize_method="lower_bound",
|
||||
image_interpolation_method=cv2.INTER_AREA,
|
||||
):
|
||||
"""Init.
|
||||
|
||||
Args:
|
||||
width (int): desired output width
|
||||
height (int): desired output height
|
||||
resize_target (bool, optional):
|
||||
True: Resize the full sample (image, mask, target).
|
||||
False: Resize image only.
|
||||
Defaults to True.
|
||||
keep_aspect_ratio (bool, optional):
|
||||
True: Keep the aspect ratio of the input sample.
|
||||
Output sample might not have the given width and height, and
|
||||
resize behaviour depends on the parameter 'resize_method'.
|
||||
Defaults to False.
|
||||
ensure_multiple_of (int, optional):
|
||||
Output width and height is constrained to be multiple of this parameter.
|
||||
Defaults to 1.
|
||||
resize_method (str, optional):
|
||||
"lower_bound": Output will be at least as large as the given size.
|
||||
"upper_bound": Output will be at max as large as the given size. (Output size might be smaller than given size.)
|
||||
"minimal": Scale as least as possible. (Output size might be smaller than given size.)
|
||||
Defaults to "lower_bound".
|
||||
"""
|
||||
self.__width = width
|
||||
self.__height = height
|
||||
|
||||
self.__resize_target = resize_target
|
||||
self.__keep_aspect_ratio = keep_aspect_ratio
|
||||
self.__multiple_of = ensure_multiple_of
|
||||
self.__resize_method = resize_method
|
||||
self.__image_interpolation_method = image_interpolation_method
|
||||
|
||||
def constrain_to_multiple_of(self, x, min_val=0, max_val=None):
|
||||
y = (np.round(x / self.__multiple_of) * self.__multiple_of).astype(int)
|
||||
|
||||
if max_val is not None and y > max_val:
|
||||
y = (np.floor(x / self.__multiple_of) * self.__multiple_of).astype(int)
|
||||
|
||||
if y < min_val:
|
||||
y = (np.ceil(x / self.__multiple_of) * self.__multiple_of).astype(int)
|
||||
|
||||
return y
|
||||
|
||||
def get_size(self, width, height):
|
||||
# determine new height and width
|
||||
scale_height = self.__height / height
|
||||
scale_width = self.__width / width
|
||||
|
||||
if self.__keep_aspect_ratio:
|
||||
if self.__resize_method == "lower_bound":
|
||||
# scale such that output size is lower bound
|
||||
if scale_width > scale_height:
|
||||
# fit width
|
||||
scale_height = scale_width
|
||||
else:
|
||||
# fit height
|
||||
scale_width = scale_height
|
||||
elif self.__resize_method == "upper_bound":
|
||||
# scale such that output size is upper bound
|
||||
if scale_width < scale_height:
|
||||
# fit width
|
||||
scale_height = scale_width
|
||||
else:
|
||||
# fit height
|
||||
scale_width = scale_height
|
||||
elif self.__resize_method == "minimal":
|
||||
# scale as least as possbile
|
||||
if abs(1 - scale_width) < abs(1 - scale_height):
|
||||
# fit width
|
||||
scale_height = scale_width
|
||||
else:
|
||||
# fit height
|
||||
scale_width = scale_height
|
||||
else:
|
||||
raise ValueError(
|
||||
f"resize_method {self.__resize_method} not implemented"
|
||||
)
|
||||
|
||||
if self.__resize_method == "lower_bound":
|
||||
new_height = self.constrain_to_multiple_of(
|
||||
scale_height * height, min_val=self.__height
|
||||
)
|
||||
new_width = self.constrain_to_multiple_of(
|
||||
scale_width * width, min_val=self.__width
|
||||
)
|
||||
elif self.__resize_method == "upper_bound":
|
||||
new_height = self.constrain_to_multiple_of(
|
||||
scale_height * height, max_val=self.__height
|
||||
)
|
||||
new_width = self.constrain_to_multiple_of(
|
||||
scale_width * width, max_val=self.__width
|
||||
)
|
||||
elif self.__resize_method == "minimal":
|
||||
new_height = self.constrain_to_multiple_of(scale_height * height)
|
||||
new_width = self.constrain_to_multiple_of(scale_width * width)
|
||||
else:
|
||||
raise ValueError(f"resize_method {self.__resize_method} not implemented")
|
||||
|
||||
return (new_width, new_height)
|
||||
|
||||
def __call__(self, sample):
|
||||
width, height = self.get_size(
|
||||
sample["image"].shape[1], sample["image"].shape[0]
|
||||
)
|
||||
|
||||
# resize sample
|
||||
sample["image"] = cv2.resize(
|
||||
sample["image"],
|
||||
(width, height),
|
||||
interpolation=self.__image_interpolation_method,
|
||||
)
|
||||
|
||||
if self.__resize_target:
|
||||
if "disparity" in sample:
|
||||
sample["disparity"] = cv2.resize(
|
||||
sample["disparity"],
|
||||
(width, height),
|
||||
interpolation=cv2.INTER_NEAREST,
|
||||
)
|
||||
|
||||
if "depth" in sample:
|
||||
sample["depth"] = cv2.resize(
|
||||
sample["depth"], (width, height), interpolation=cv2.INTER_NEAREST
|
||||
)
|
||||
|
||||
sample["mask"] = cv2.resize(
|
||||
sample["mask"].astype(np.float32),
|
||||
(width, height),
|
||||
interpolation=cv2.INTER_NEAREST,
|
||||
)
|
||||
sample["mask"] = sample["mask"].astype(bool)
|
||||
|
||||
return sample
|
||||
|
||||
|
||||
class NormalizeImage(object):
|
||||
"""Normlize image by given mean and std.
|
||||
"""
|
||||
|
||||
def __init__(self, mean, std):
|
||||
self.__mean = mean
|
||||
self.__std = std
|
||||
|
||||
def __call__(self, sample):
|
||||
sample["image"] = (sample["image"] - self.__mean) / self.__std
|
||||
|
||||
return sample
|
||||
|
||||
|
||||
class PrepareForNet(object):
|
||||
"""Prepare sample for usage as network input.
|
||||
"""
|
||||
|
||||
def __init__(self):
|
||||
pass
|
||||
|
||||
def __call__(self, sample):
|
||||
image = np.transpose(sample["image"], (2, 0, 1))
|
||||
sample["image"] = np.ascontiguousarray(image).astype(np.float32)
|
||||
|
||||
if "mask" in sample:
|
||||
sample["mask"] = sample["mask"].astype(np.float32)
|
||||
sample["mask"] = np.ascontiguousarray(sample["mask"])
|
||||
|
||||
if "disparity" in sample:
|
||||
disparity = sample["disparity"].astype(np.float32)
|
||||
sample["disparity"] = np.ascontiguousarray(disparity)
|
||||
|
||||
if "depth" in sample:
|
||||
depth = sample["depth"].astype(np.float32)
|
||||
sample["depth"] = np.ascontiguousarray(depth)
|
||||
|
||||
return sample
|
||||
491
comfy/annotator/midas/midas/vit.py
Normal file
491
comfy/annotator/midas/midas/vit.py
Normal file
@ -0,0 +1,491 @@
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
import timm
|
||||
import types
|
||||
import math
|
||||
import torch.nn.functional as F
|
||||
|
||||
|
||||
class Slice(nn.Module):
|
||||
def __init__(self, start_index=1):
|
||||
super(Slice, self).__init__()
|
||||
self.start_index = start_index
|
||||
|
||||
def forward(self, x):
|
||||
return x[:, self.start_index :]
|
||||
|
||||
|
||||
class AddReadout(nn.Module):
|
||||
def __init__(self, start_index=1):
|
||||
super(AddReadout, self).__init__()
|
||||
self.start_index = start_index
|
||||
|
||||
def forward(self, x):
|
||||
if self.start_index == 2:
|
||||
readout = (x[:, 0] + x[:, 1]) / 2
|
||||
else:
|
||||
readout = x[:, 0]
|
||||
return x[:, self.start_index :] + readout.unsqueeze(1)
|
||||
|
||||
|
||||
class ProjectReadout(nn.Module):
|
||||
def __init__(self, in_features, start_index=1):
|
||||
super(ProjectReadout, self).__init__()
|
||||
self.start_index = start_index
|
||||
|
||||
self.project = nn.Sequential(nn.Linear(2 * in_features, in_features), nn.GELU())
|
||||
|
||||
def forward(self, x):
|
||||
readout = x[:, 0].unsqueeze(1).expand_as(x[:, self.start_index :])
|
||||
features = torch.cat((x[:, self.start_index :], readout), -1)
|
||||
|
||||
return self.project(features)
|
||||
|
||||
|
||||
class Transpose(nn.Module):
|
||||
def __init__(self, dim0, dim1):
|
||||
super(Transpose, self).__init__()
|
||||
self.dim0 = dim0
|
||||
self.dim1 = dim1
|
||||
|
||||
def forward(self, x):
|
||||
x = x.transpose(self.dim0, self.dim1)
|
||||
return x
|
||||
|
||||
|
||||
def forward_vit(pretrained, x):
|
||||
b, c, h, w = x.shape
|
||||
|
||||
glob = pretrained.model.forward_flex(x)
|
||||
|
||||
layer_1 = pretrained.activations["1"]
|
||||
layer_2 = pretrained.activations["2"]
|
||||
layer_3 = pretrained.activations["3"]
|
||||
layer_4 = pretrained.activations["4"]
|
||||
|
||||
layer_1 = pretrained.act_postprocess1[0:2](layer_1)
|
||||
layer_2 = pretrained.act_postprocess2[0:2](layer_2)
|
||||
layer_3 = pretrained.act_postprocess3[0:2](layer_3)
|
||||
layer_4 = pretrained.act_postprocess4[0:2](layer_4)
|
||||
|
||||
unflatten = nn.Sequential(
|
||||
nn.Unflatten(
|
||||
2,
|
||||
torch.Size(
|
||||
[
|
||||
h // pretrained.model.patch_size[1],
|
||||
w // pretrained.model.patch_size[0],
|
||||
]
|
||||
),
|
||||
)
|
||||
)
|
||||
|
||||
if layer_1.ndim == 3:
|
||||
layer_1 = unflatten(layer_1)
|
||||
if layer_2.ndim == 3:
|
||||
layer_2 = unflatten(layer_2)
|
||||
if layer_3.ndim == 3:
|
||||
layer_3 = unflatten(layer_3)
|
||||
if layer_4.ndim == 3:
|
||||
layer_4 = unflatten(layer_4)
|
||||
|
||||
layer_1 = pretrained.act_postprocess1[3 : len(pretrained.act_postprocess1)](layer_1)
|
||||
layer_2 = pretrained.act_postprocess2[3 : len(pretrained.act_postprocess2)](layer_2)
|
||||
layer_3 = pretrained.act_postprocess3[3 : len(pretrained.act_postprocess3)](layer_3)
|
||||
layer_4 = pretrained.act_postprocess4[3 : len(pretrained.act_postprocess4)](layer_4)
|
||||
|
||||
return layer_1, layer_2, layer_3, layer_4
|
||||
|
||||
|
||||
def _resize_pos_embed(self, posemb, gs_h, gs_w):
|
||||
posemb_tok, posemb_grid = (
|
||||
posemb[:, : self.start_index],
|
||||
posemb[0, self.start_index :],
|
||||
)
|
||||
|
||||
gs_old = int(math.sqrt(len(posemb_grid)))
|
||||
|
||||
posemb_grid = posemb_grid.reshape(1, gs_old, gs_old, -1).permute(0, 3, 1, 2)
|
||||
posemb_grid = F.interpolate(posemb_grid, size=(gs_h, gs_w), mode="bilinear")
|
||||
posemb_grid = posemb_grid.permute(0, 2, 3, 1).reshape(1, gs_h * gs_w, -1)
|
||||
|
||||
posemb = torch.cat([posemb_tok, posemb_grid], dim=1)
|
||||
|
||||
return posemb
|
||||
|
||||
|
||||
def forward_flex(self, x):
|
||||
b, c, h, w = x.shape
|
||||
|
||||
pos_embed = self._resize_pos_embed(
|
||||
self.pos_embed, h // self.patch_size[1], w // self.patch_size[0]
|
||||
)
|
||||
|
||||
B = x.shape[0]
|
||||
|
||||
if hasattr(self.patch_embed, "backbone"):
|
||||
x = self.patch_embed.backbone(x)
|
||||
if isinstance(x, (list, tuple)):
|
||||
x = x[-1] # last feature if backbone outputs list/tuple of features
|
||||
|
||||
x = self.patch_embed.proj(x).flatten(2).transpose(1, 2)
|
||||
|
||||
if getattr(self, "dist_token", None) is not None:
|
||||
cls_tokens = self.cls_token.expand(
|
||||
B, -1, -1
|
||||
) # stole cls_tokens impl from Phil Wang, thanks
|
||||
dist_token = self.dist_token.expand(B, -1, -1)
|
||||
x = torch.cat((cls_tokens, dist_token, x), dim=1)
|
||||
else:
|
||||
cls_tokens = self.cls_token.expand(
|
||||
B, -1, -1
|
||||
) # stole cls_tokens impl from Phil Wang, thanks
|
||||
x = torch.cat((cls_tokens, x), dim=1)
|
||||
|
||||
x = x + pos_embed
|
||||
x = self.pos_drop(x)
|
||||
|
||||
for blk in self.blocks:
|
||||
x = blk(x)
|
||||
|
||||
x = self.norm(x)
|
||||
|
||||
return x
|
||||
|
||||
|
||||
activations = {}
|
||||
|
||||
|
||||
def get_activation(name):
|
||||
def hook(model, input, output):
|
||||
activations[name] = output
|
||||
|
||||
return hook
|
||||
|
||||
|
||||
def get_readout_oper(vit_features, features, use_readout, start_index=1):
|
||||
if use_readout == "ignore":
|
||||
readout_oper = [Slice(start_index)] * len(features)
|
||||
elif use_readout == "add":
|
||||
readout_oper = [AddReadout(start_index)] * len(features)
|
||||
elif use_readout == "project":
|
||||
readout_oper = [
|
||||
ProjectReadout(vit_features, start_index) for out_feat in features
|
||||
]
|
||||
else:
|
||||
assert (
|
||||
False
|
||||
), "wrong operation for readout token, use_readout can be 'ignore', 'add', or 'project'"
|
||||
|
||||
return readout_oper
|
||||
|
||||
|
||||
def _make_vit_b16_backbone(
|
||||
model,
|
||||
features=[96, 192, 384, 768],
|
||||
size=[384, 384],
|
||||
hooks=[2, 5, 8, 11],
|
||||
vit_features=768,
|
||||
use_readout="ignore",
|
||||
start_index=1,
|
||||
):
|
||||
pretrained = nn.Module()
|
||||
|
||||
pretrained.model = model
|
||||
pretrained.model.blocks[hooks[0]].register_forward_hook(get_activation("1"))
|
||||
pretrained.model.blocks[hooks[1]].register_forward_hook(get_activation("2"))
|
||||
pretrained.model.blocks[hooks[2]].register_forward_hook(get_activation("3"))
|
||||
pretrained.model.blocks[hooks[3]].register_forward_hook(get_activation("4"))
|
||||
|
||||
pretrained.activations = activations
|
||||
|
||||
readout_oper = get_readout_oper(vit_features, features, use_readout, start_index)
|
||||
|
||||
# 32, 48, 136, 384
|
||||
pretrained.act_postprocess1 = nn.Sequential(
|
||||
readout_oper[0],
|
||||
Transpose(1, 2),
|
||||
nn.Unflatten(2, torch.Size([size[0] // 16, size[1] // 16])),
|
||||
nn.Conv2d(
|
||||
in_channels=vit_features,
|
||||
out_channels=features[0],
|
||||
kernel_size=1,
|
||||
stride=1,
|
||||
padding=0,
|
||||
),
|
||||
nn.ConvTranspose2d(
|
||||
in_channels=features[0],
|
||||
out_channels=features[0],
|
||||
kernel_size=4,
|
||||
stride=4,
|
||||
padding=0,
|
||||
bias=True,
|
||||
dilation=1,
|
||||
groups=1,
|
||||
),
|
||||
)
|
||||
|
||||
pretrained.act_postprocess2 = nn.Sequential(
|
||||
readout_oper[1],
|
||||
Transpose(1, 2),
|
||||
nn.Unflatten(2, torch.Size([size[0] // 16, size[1] // 16])),
|
||||
nn.Conv2d(
|
||||
in_channels=vit_features,
|
||||
out_channels=features[1],
|
||||
kernel_size=1,
|
||||
stride=1,
|
||||
padding=0,
|
||||
),
|
||||
nn.ConvTranspose2d(
|
||||
in_channels=features[1],
|
||||
out_channels=features[1],
|
||||
kernel_size=2,
|
||||
stride=2,
|
||||
padding=0,
|
||||
bias=True,
|
||||
dilation=1,
|
||||
groups=1,
|
||||
),
|
||||
)
|
||||
|
||||
pretrained.act_postprocess3 = nn.Sequential(
|
||||
readout_oper[2],
|
||||
Transpose(1, 2),
|
||||
nn.Unflatten(2, torch.Size([size[0] // 16, size[1] // 16])),
|
||||
nn.Conv2d(
|
||||
in_channels=vit_features,
|
||||
out_channels=features[2],
|
||||
kernel_size=1,
|
||||
stride=1,
|
||||
padding=0,
|
||||
),
|
||||
)
|
||||
|
||||
pretrained.act_postprocess4 = nn.Sequential(
|
||||
readout_oper[3],
|
||||
Transpose(1, 2),
|
||||
nn.Unflatten(2, torch.Size([size[0] // 16, size[1] // 16])),
|
||||
nn.Conv2d(
|
||||
in_channels=vit_features,
|
||||
out_channels=features[3],
|
||||
kernel_size=1,
|
||||
stride=1,
|
||||
padding=0,
|
||||
),
|
||||
nn.Conv2d(
|
||||
in_channels=features[3],
|
||||
out_channels=features[3],
|
||||
kernel_size=3,
|
||||
stride=2,
|
||||
padding=1,
|
||||
),
|
||||
)
|
||||
|
||||
pretrained.model.start_index = start_index
|
||||
pretrained.model.patch_size = [16, 16]
|
||||
|
||||
# We inject this function into the VisionTransformer instances so that
|
||||
# we can use it with interpolated position embeddings without modifying the library source.
|
||||
pretrained.model.forward_flex = types.MethodType(forward_flex, pretrained.model)
|
||||
pretrained.model._resize_pos_embed = types.MethodType(
|
||||
_resize_pos_embed, pretrained.model
|
||||
)
|
||||
|
||||
return pretrained
|
||||
|
||||
|
||||
def _make_pretrained_vitl16_384(pretrained, use_readout="ignore", hooks=None):
|
||||
model = timm.create_model("vit_large_patch16_384", pretrained=pretrained)
|
||||
|
||||
hooks = [5, 11, 17, 23] if hooks == None else hooks
|
||||
return _make_vit_b16_backbone(
|
||||
model,
|
||||
features=[256, 512, 1024, 1024],
|
||||
hooks=hooks,
|
||||
vit_features=1024,
|
||||
use_readout=use_readout,
|
||||
)
|
||||
|
||||
|
||||
def _make_pretrained_vitb16_384(pretrained, use_readout="ignore", hooks=None):
|
||||
model = timm.create_model("vit_base_patch16_384", pretrained=pretrained)
|
||||
|
||||
hooks = [2, 5, 8, 11] if hooks == None else hooks
|
||||
return _make_vit_b16_backbone(
|
||||
model, features=[96, 192, 384, 768], hooks=hooks, use_readout=use_readout
|
||||
)
|
||||
|
||||
|
||||
def _make_pretrained_deitb16_384(pretrained, use_readout="ignore", hooks=None):
|
||||
model = timm.create_model("vit_deit_base_patch16_384", pretrained=pretrained)
|
||||
|
||||
hooks = [2, 5, 8, 11] if hooks == None else hooks
|
||||
return _make_vit_b16_backbone(
|
||||
model, features=[96, 192, 384, 768], hooks=hooks, use_readout=use_readout
|
||||
)
|
||||
|
||||
|
||||
def _make_pretrained_deitb16_distil_384(pretrained, use_readout="ignore", hooks=None):
|
||||
model = timm.create_model(
|
||||
"vit_deit_base_distilled_patch16_384", pretrained=pretrained
|
||||
)
|
||||
|
||||
hooks = [2, 5, 8, 11] if hooks == None else hooks
|
||||
return _make_vit_b16_backbone(
|
||||
model,
|
||||
features=[96, 192, 384, 768],
|
||||
hooks=hooks,
|
||||
use_readout=use_readout,
|
||||
start_index=2,
|
||||
)
|
||||
|
||||
|
||||
def _make_vit_b_rn50_backbone(
|
||||
model,
|
||||
features=[256, 512, 768, 768],
|
||||
size=[384, 384],
|
||||
hooks=[0, 1, 8, 11],
|
||||
vit_features=768,
|
||||
use_vit_only=False,
|
||||
use_readout="ignore",
|
||||
start_index=1,
|
||||
):
|
||||
pretrained = nn.Module()
|
||||
|
||||
pretrained.model = model
|
||||
|
||||
if use_vit_only == True:
|
||||
pretrained.model.blocks[hooks[0]].register_forward_hook(get_activation("1"))
|
||||
pretrained.model.blocks[hooks[1]].register_forward_hook(get_activation("2"))
|
||||
else:
|
||||
pretrained.model.patch_embed.backbone.stages[0].register_forward_hook(
|
||||
get_activation("1")
|
||||
)
|
||||
pretrained.model.patch_embed.backbone.stages[1].register_forward_hook(
|
||||
get_activation("2")
|
||||
)
|
||||
|
||||
pretrained.model.blocks[hooks[2]].register_forward_hook(get_activation("3"))
|
||||
pretrained.model.blocks[hooks[3]].register_forward_hook(get_activation("4"))
|
||||
|
||||
pretrained.activations = activations
|
||||
|
||||
readout_oper = get_readout_oper(vit_features, features, use_readout, start_index)
|
||||
|
||||
if use_vit_only == True:
|
||||
pretrained.act_postprocess1 = nn.Sequential(
|
||||
readout_oper[0],
|
||||
Transpose(1, 2),
|
||||
nn.Unflatten(2, torch.Size([size[0] // 16, size[1] // 16])),
|
||||
nn.Conv2d(
|
||||
in_channels=vit_features,
|
||||
out_channels=features[0],
|
||||
kernel_size=1,
|
||||
stride=1,
|
||||
padding=0,
|
||||
),
|
||||
nn.ConvTranspose2d(
|
||||
in_channels=features[0],
|
||||
out_channels=features[0],
|
||||
kernel_size=4,
|
||||
stride=4,
|
||||
padding=0,
|
||||
bias=True,
|
||||
dilation=1,
|
||||
groups=1,
|
||||
),
|
||||
)
|
||||
|
||||
pretrained.act_postprocess2 = nn.Sequential(
|
||||
readout_oper[1],
|
||||
Transpose(1, 2),
|
||||
nn.Unflatten(2, torch.Size([size[0] // 16, size[1] // 16])),
|
||||
nn.Conv2d(
|
||||
in_channels=vit_features,
|
||||
out_channels=features[1],
|
||||
kernel_size=1,
|
||||
stride=1,
|
||||
padding=0,
|
||||
),
|
||||
nn.ConvTranspose2d(
|
||||
in_channels=features[1],
|
||||
out_channels=features[1],
|
||||
kernel_size=2,
|
||||
stride=2,
|
||||
padding=0,
|
||||
bias=True,
|
||||
dilation=1,
|
||||
groups=1,
|
||||
),
|
||||
)
|
||||
else:
|
||||
pretrained.act_postprocess1 = nn.Sequential(
|
||||
nn.Identity(), nn.Identity(), nn.Identity()
|
||||
)
|
||||
pretrained.act_postprocess2 = nn.Sequential(
|
||||
nn.Identity(), nn.Identity(), nn.Identity()
|
||||
)
|
||||
|
||||
pretrained.act_postprocess3 = nn.Sequential(
|
||||
readout_oper[2],
|
||||
Transpose(1, 2),
|
||||
nn.Unflatten(2, torch.Size([size[0] // 16, size[1] // 16])),
|
||||
nn.Conv2d(
|
||||
in_channels=vit_features,
|
||||
out_channels=features[2],
|
||||
kernel_size=1,
|
||||
stride=1,
|
||||
padding=0,
|
||||
),
|
||||
)
|
||||
|
||||
pretrained.act_postprocess4 = nn.Sequential(
|
||||
readout_oper[3],
|
||||
Transpose(1, 2),
|
||||
nn.Unflatten(2, torch.Size([size[0] // 16, size[1] // 16])),
|
||||
nn.Conv2d(
|
||||
in_channels=vit_features,
|
||||
out_channels=features[3],
|
||||
kernel_size=1,
|
||||
stride=1,
|
||||
padding=0,
|
||||
),
|
||||
nn.Conv2d(
|
||||
in_channels=features[3],
|
||||
out_channels=features[3],
|
||||
kernel_size=3,
|
||||
stride=2,
|
||||
padding=1,
|
||||
),
|
||||
)
|
||||
|
||||
pretrained.model.start_index = start_index
|
||||
pretrained.model.patch_size = [16, 16]
|
||||
|
||||
# We inject this function into the VisionTransformer instances so that
|
||||
# we can use it with interpolated position embeddings without modifying the library source.
|
||||
pretrained.model.forward_flex = types.MethodType(forward_flex, pretrained.model)
|
||||
|
||||
# We inject this function into the VisionTransformer instances so that
|
||||
# we can use it with interpolated position embeddings without modifying the library source.
|
||||
pretrained.model._resize_pos_embed = types.MethodType(
|
||||
_resize_pos_embed, pretrained.model
|
||||
)
|
||||
|
||||
return pretrained
|
||||
|
||||
|
||||
def _make_pretrained_vitb_rn50_384(
|
||||
pretrained, use_readout="ignore", hooks=None, use_vit_only=False
|
||||
):
|
||||
model = timm.create_model("vit_base_resnet50_384", pretrained=pretrained)
|
||||
|
||||
hooks = [0, 1, 8, 11] if hooks is None else hooks
|
||||
return _make_vit_b_rn50_backbone(
|
||||
model,
|
||||
features=[256, 512, 768, 768],
|
||||
size=[384, 384],
|
||||
hooks=hooks,
|
||||
use_vit_only=use_vit_only,
|
||||
use_readout=use_readout,
|
||||
)
|
||||
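Note (not part of the commit): the ViT backbone factories above depend on forward hooks that stash intermediate activations in a module-level dict via get_activation, defined earlier in this file. A minimal, self-contained sketch of that pattern, using a toy model instead of the timm ViT, might look like this:

import torch
import torch.nn as nn

activations = {}

def get_activation(name):
    # Return a hook that records the module's output under `name`.
    def hook(module, inputs, output):
        activations[name] = output
    return hook

toy = nn.Sequential(nn.Linear(8, 16), nn.ReLU(), nn.Linear(16, 4))
toy[0].register_forward_hook(get_activation("1"))
toy[2].register_forward_hook(get_activation("2"))

with torch.no_grad():
    toy(torch.randn(2, 8))

print(activations["1"].shape, activations["2"].shape)  # (2, 16) and (2, 4)

The backbone builders use exactly this mechanism to pull features out of selected transformer blocks (and ResNet stages) before the act_postprocess heads reshape them.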
189
comfy/annotator/midas/utils.py
Normal file
@ -0,0 +1,189 @@
|
||||
"""Utils for monoDepth."""
|
||||
import sys
|
||||
import re
|
||||
import numpy as np
|
||||
import cv2
|
||||
import torch
|
||||
|
||||
|
||||
def read_pfm(path):
|
||||
"""Read pfm file.
|
||||
|
||||
Args:
|
||||
path (str): path to file
|
||||
|
||||
Returns:
|
||||
tuple: (data, scale)
|
||||
"""
|
||||
with open(path, "rb") as file:
|
||||
|
||||
color = None
|
||||
width = None
|
||||
height = None
|
||||
scale = None
|
||||
endian = None
|
||||
|
||||
header = file.readline().rstrip()
|
||||
if header.decode("ascii") == "PF":
|
||||
color = True
|
||||
elif header.decode("ascii") == "Pf":
|
||||
color = False
|
||||
else:
|
||||
raise Exception("Not a PFM file: " + path)
|
||||
|
||||
dim_match = re.match(r"^(\d+)\s(\d+)\s$", file.readline().decode("ascii"))
|
||||
if dim_match:
|
||||
width, height = list(map(int, dim_match.groups()))
|
||||
else:
|
||||
raise Exception("Malformed PFM header.")
|
||||
|
||||
scale = float(file.readline().decode("ascii").rstrip())
|
||||
if scale < 0:
|
||||
# little-endian
|
||||
endian = "<"
|
||||
scale = -scale
|
||||
else:
|
||||
# big-endian
|
||||
endian = ">"
|
||||
|
||||
data = np.fromfile(file, endian + "f")
|
||||
shape = (height, width, 3) if color else (height, width)
|
||||
|
||||
data = np.reshape(data, shape)
|
||||
data = np.flipud(data)
|
||||
|
||||
return data, scale
|
||||
|
||||
|
||||
def write_pfm(path, image, scale=1):
|
||||
"""Write pfm file.
|
||||
|
||||
Args:
|
||||
path (str): path to file
|
||||
image (array): data
|
||||
scale (int, optional): Scale. Defaults to 1.
|
||||
"""
|
||||
|
||||
with open(path, "wb") as file:
|
||||
color = None
|
||||
|
||||
if image.dtype.name != "float32":
|
||||
raise Exception("Image dtype must be float32.")
|
||||
|
||||
image = np.flipud(image)
|
||||
|
||||
if len(image.shape) == 3 and image.shape[2] == 3: # color image
|
||||
color = True
|
||||
elif (
|
||||
len(image.shape) == 2 or len(image.shape) == 3 and image.shape[2] == 1
|
||||
): # greyscale
|
||||
color = False
|
||||
else:
|
||||
raise Exception("Image must have H x W x 3, H x W x 1 or H x W dimensions.")
|
||||
|
||||
file.write("PF\n" if color else "Pf\n".encode())
|
||||
file.write("%d %d\n".encode() % (image.shape[1], image.shape[0]))
|
||||
|
||||
endian = image.dtype.byteorder
|
||||
|
||||
if endian == "<" or endian == "=" and sys.byteorder == "little":
|
||||
scale = -scale
|
||||
|
||||
file.write("%f\n".encode() % scale)
|
||||
|
||||
image.tofile(file)
|
||||
|
||||
|
||||
def read_image(path):
|
||||
"""Read image and output RGB image (0-1).
|
||||
|
||||
Args:
|
||||
path (str): path to file
|
||||
|
||||
Returns:
|
||||
array: RGB image (0-1)
|
||||
"""
|
||||
img = cv2.imread(path)
|
||||
|
||||
if img.ndim == 2:
|
||||
img = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR)
|
||||
|
||||
img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) / 255.0
|
||||
|
||||
return img
|
||||
|
||||
|
||||
def resize_image(img):
|
||||
"""Resize image and make it fit for network.
|
||||
|
||||
Args:
|
||||
img (array): image
|
||||
|
||||
Returns:
|
||||
tensor: data ready for network
|
||||
"""
|
||||
height_orig = img.shape[0]
|
||||
width_orig = img.shape[1]
|
||||
|
||||
if width_orig > height_orig:
|
||||
scale = width_orig / 384
|
||||
else:
|
||||
scale = height_orig / 384
|
||||
|
||||
height = (np.ceil(height_orig / scale / 32) * 32).astype(int)
|
||||
width = (np.ceil(width_orig / scale / 32) * 32).astype(int)
|
||||
|
||||
img_resized = cv2.resize(img, (width, height), interpolation=cv2.INTER_AREA)
|
||||
|
||||
img_resized = (
|
||||
torch.from_numpy(np.transpose(img_resized, (2, 0, 1))).contiguous().float()
|
||||
)
|
||||
img_resized = img_resized.unsqueeze(0)
|
||||
|
||||
return img_resized
|
||||
|
||||
|
||||
def resize_depth(depth, width, height):
|
||||
"""Resize depth map and bring to CPU (numpy).
|
||||
|
||||
Args:
|
||||
depth (tensor): depth
|
||||
width (int): image width
|
||||
height (int): image height
|
||||
|
||||
Returns:
|
||||
array: processed depth
|
||||
"""
|
||||
depth = torch.squeeze(depth[0, :, :, :]).to("cpu")
|
||||
|
||||
depth_resized = cv2.resize(
|
||||
depth.numpy(), (width, height), interpolation=cv2.INTER_CUBIC
|
||||
)
|
||||
|
||||
return depth_resized
|
||||
|
||||
def write_depth(path, depth, bits=1):
|
||||
"""Write depth map to pfm and png file.
|
||||
|
||||
Args:
|
||||
path (str): filepath without extension
|
||||
depth (array): depth
|
||||
"""
|
||||
write_pfm(path + ".pfm", depth.astype(np.float32))
|
||||
|
||||
depth_min = depth.min()
|
||||
depth_max = depth.max()
|
||||
|
||||
max_val = (2**(8*bits))-1
|
||||
|
||||
if depth_max - depth_min > np.finfo("float").eps:
|
||||
out = max_val * (depth - depth_min) / (depth_max - depth_min)
|
||||
else:
|
||||
out = np.zeros(depth.shape, dtype=depth.dtype)
|
||||
|
||||
if bits == 1:
|
||||
cv2.imwrite(path + ".png", out.astype("uint8"))
|
||||
elif bits == 2:
|
||||
cv2.imwrite(path + ".png", out.astype("uint16"))
|
||||
|
||||
return
|
||||
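For reference (not part of the commit), the utilities in this file chain together into a simple depth pipeline. Here `model` is a placeholder for any MiDaS-style network mapping a 1x3xHxW tensor to a 1x1xhxw prediction, and the file names are hypothetical:

img = read_image("input.jpg")                      # HxWx3 float RGB in [0, 1]
sample = resize_image(img)                         # 1x3xH'xW' tensor, sides resized to multiples of 32
with torch.no_grad():
    prediction = model(sample)                     # placeholder model; expected output shape 1x1xh'xw'
depth = resize_depth(prediction, img.shape[1], img.shape[0])
write_depth("output", depth, bits=2)               # writes output.pfm and a 16-bit output.png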
39
comfy/annotator/mlsd/__init__.py
Normal file
@ -0,0 +1,39 @@
|
||||
import cv2
|
||||
import numpy as np
|
||||
import torch
|
||||
import os
|
||||
|
||||
from einops import rearrange
|
||||
from .models.mbv2_mlsd_tiny import MobileV2_MLSD_Tiny
|
||||
from .models.mbv2_mlsd_large import MobileV2_MLSD_Large
|
||||
from .utils import pred_lines
|
||||
|
||||
from annotator.util import annotator_ckpts_path
|
||||
|
||||
|
||||
remote_model_path = "https://huggingface.co/lllyasviel/ControlNet/resolve/main/annotator/ckpts/mlsd_large_512_fp32.pth"
|
||||
|
||||
|
||||
class MLSDdetector:
|
||||
def __init__(self):
|
||||
model_path = os.path.join(annotator_ckpts_path, "mlsd_large_512_fp32.pth")
|
||||
if not os.path.exists(model_path):
|
||||
from basicsr.utils.download_util import load_file_from_url
|
||||
load_file_from_url(remote_model_path, model_dir=annotator_ckpts_path)
|
||||
model = MobileV2_MLSD_Large()
|
||||
model.load_state_dict(torch.load(model_path), strict=True)
|
||||
self.model = model.cuda().eval()
|
||||
|
||||
def __call__(self, input_image, thr_v, thr_d):
|
||||
assert input_image.ndim == 3
|
||||
img = input_image
|
||||
img_output = np.zeros_like(img)
|
||||
try:
|
||||
with torch.no_grad():
|
||||
lines = pred_lines(img, self.model, [img.shape[0], img.shape[1]], thr_v, thr_d)
|
||||
for line in lines:
|
||||
x_start, y_start, x_end, y_end = [int(val) for val in line]
|
||||
cv2.line(img_output, (x_start, y_start), (x_end, y_end), [255, 255, 255], 1)
|
||||
except Exception:
|
||||
pass
|
||||
return img_output[:, :, 0]
|
||||
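A rough usage sketch for the detector above (not part of the commit; it assumes a CUDA device and an RGB HxWx3 uint8 input as in the ControlNet reference app, and the image path is a placeholder):

import cv2

mlsd = MLSDdetector()                              # downloads mlsd_large_512_fp32.pth on first use
img = cv2.imread("room.jpg")[:, :, ::-1].copy()    # BGR -> RGB, HxWx3 uint8
line_map = mlsd(img, thr_v=0.1, thr_d=0.1)         # 2-D uint8 map with detected line segments drawn in white
cv2.imwrite("mlsd_lines.png", line_map)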
292
comfy/annotator/mlsd/models/mbv2_mlsd_large.py
Normal file
@ -0,0 +1,292 @@
|
||||
import os
|
||||
import sys
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
import torch.utils.model_zoo as model_zoo
|
||||
from torch.nn import functional as F
|
||||
|
||||
|
||||
class BlockTypeA(nn.Module):
|
||||
def __init__(self, in_c1, in_c2, out_c1, out_c2, upscale = True):
|
||||
super(BlockTypeA, self).__init__()
|
||||
self.conv1 = nn.Sequential(
|
||||
nn.Conv2d(in_c2, out_c2, kernel_size=1),
|
||||
nn.BatchNorm2d(out_c2),
|
||||
nn.ReLU(inplace=True)
|
||||
)
|
||||
self.conv2 = nn.Sequential(
|
||||
nn.Conv2d(in_c1, out_c1, kernel_size=1),
|
||||
nn.BatchNorm2d(out_c1),
|
||||
nn.ReLU(inplace=True)
|
||||
)
|
||||
self.upscale = upscale
|
||||
|
||||
def forward(self, a, b):
|
||||
b = self.conv1(b)
|
||||
a = self.conv2(a)
|
||||
if self.upscale:
|
||||
b = F.interpolate(b, scale_factor=2.0, mode='bilinear', align_corners=True)
|
||||
return torch.cat((a, b), dim=1)
|
||||
|
||||
|
||||
class BlockTypeB(nn.Module):
|
||||
def __init__(self, in_c, out_c):
|
||||
super(BlockTypeB, self).__init__()
|
||||
self.conv1 = nn.Sequential(
|
||||
nn.Conv2d(in_c, in_c, kernel_size=3, padding=1),
|
||||
nn.BatchNorm2d(in_c),
|
||||
nn.ReLU()
|
||||
)
|
||||
self.conv2 = nn.Sequential(
|
||||
nn.Conv2d(in_c, out_c, kernel_size=3, padding=1),
|
||||
nn.BatchNorm2d(out_c),
|
||||
nn.ReLU()
|
||||
)
|
||||
|
||||
def forward(self, x):
|
||||
x = self.conv1(x) + x
|
||||
x = self.conv2(x)
|
||||
return x
|
||||
|
||||
class BlockTypeC(nn.Module):
|
||||
def __init__(self, in_c, out_c):
|
||||
super(BlockTypeC, self).__init__()
|
||||
self.conv1 = nn.Sequential(
|
||||
nn.Conv2d(in_c, in_c, kernel_size=3, padding=5, dilation=5),
|
||||
nn.BatchNorm2d(in_c),
|
||||
nn.ReLU()
|
||||
)
|
||||
self.conv2 = nn.Sequential(
|
||||
nn.Conv2d(in_c, in_c, kernel_size=3, padding=1),
|
||||
nn.BatchNorm2d(in_c),
|
||||
nn.ReLU()
|
||||
)
|
||||
self.conv3 = nn.Conv2d(in_c, out_c, kernel_size=1)
|
||||
|
||||
def forward(self, x):
|
||||
x = self.conv1(x)
|
||||
x = self.conv2(x)
|
||||
x = self.conv3(x)
|
||||
return x
|
||||
|
||||
def _make_divisible(v, divisor, min_value=None):
|
||||
"""
|
||||
This function is taken from the original tf repo.
|
||||
It ensures that all layers have a channel number that is divisible by 8
|
||||
It can be seen here:
|
||||
https://github.com/tensorflow/models/blob/master/research/slim/nets/mobilenet/mobilenet.py
|
||||
:param v:
|
||||
:param divisor:
|
||||
:param min_value:
|
||||
:return:
|
||||
"""
|
||||
if min_value is None:
|
||||
min_value = divisor
|
||||
new_v = max(min_value, int(v + divisor / 2) // divisor * divisor)
|
||||
# Make sure that round down does not go down by more than 10%.
|
||||
if new_v < 0.9 * v:
|
||||
new_v += divisor
|
||||
return new_v
|
||||
|
||||
|
||||
class ConvBNReLU(nn.Sequential):
|
||||
def __init__(self, in_planes, out_planes, kernel_size=3, stride=1, groups=1):
|
||||
self.channel_pad = out_planes - in_planes
|
||||
self.stride = stride
|
||||
#padding = (kernel_size - 1) // 2
|
||||
|
||||
# TFLite uses slightly different padding than PyTorch
|
||||
if stride == 2:
|
||||
padding = 0
|
||||
else:
|
||||
padding = (kernel_size - 1) // 2
|
||||
|
||||
super(ConvBNReLU, self).__init__(
|
||||
nn.Conv2d(in_planes, out_planes, kernel_size, stride, padding, groups=groups, bias=False),
|
||||
nn.BatchNorm2d(out_planes),
|
||||
nn.ReLU6(inplace=True)
|
||||
)
|
||||
self.max_pool = nn.MaxPool2d(kernel_size=stride, stride=stride)
|
||||
|
||||
|
||||
def forward(self, x):
|
||||
# TFLite uses different padding
|
||||
if self.stride == 2:
|
||||
x = F.pad(x, (0, 1, 0, 1), "constant", 0)
|
||||
#print(x.shape)
|
||||
|
||||
for module in self:
|
||||
if not isinstance(module, nn.MaxPool2d):
|
||||
x = module(x)
|
||||
return x
|
||||
|
||||
|
||||
class InvertedResidual(nn.Module):
|
||||
def __init__(self, inp, oup, stride, expand_ratio):
|
||||
super(InvertedResidual, self).__init__()
|
||||
self.stride = stride
|
||||
assert stride in [1, 2]
|
||||
|
||||
hidden_dim = int(round(inp * expand_ratio))
|
||||
self.use_res_connect = self.stride == 1 and inp == oup
|
||||
|
||||
layers = []
|
||||
if expand_ratio != 1:
|
||||
# pw
|
||||
layers.append(ConvBNReLU(inp, hidden_dim, kernel_size=1))
|
||||
layers.extend([
|
||||
# dw
|
||||
ConvBNReLU(hidden_dim, hidden_dim, stride=stride, groups=hidden_dim),
|
||||
# pw-linear
|
||||
nn.Conv2d(hidden_dim, oup, 1, 1, 0, bias=False),
|
||||
nn.BatchNorm2d(oup),
|
||||
])
|
||||
self.conv = nn.Sequential(*layers)
|
||||
|
||||
def forward(self, x):
|
||||
if self.use_res_connect:
|
||||
return x + self.conv(x)
|
||||
else:
|
||||
return self.conv(x)
|
||||
|
||||
|
||||
class MobileNetV2(nn.Module):
|
||||
def __init__(self, pretrained=True):
|
||||
"""
|
||||
MobileNet V2 main class
|
||||
Args:
|
||||
num_classes (int): Number of classes
|
||||
width_mult (float): Width multiplier - adjusts number of channels in each layer by this amount
|
||||
inverted_residual_setting: Network structure
|
||||
round_nearest (int): Round the number of channels in each layer to be a multiple of this number
|
||||
Set to 1 to turn off rounding
|
||||
block: Module specifying inverted residual building block for mobilenet
|
||||
"""
|
||||
super(MobileNetV2, self).__init__()
|
||||
|
||||
block = InvertedResidual
|
||||
input_channel = 32
|
||||
last_channel = 1280
|
||||
width_mult = 1.0
|
||||
round_nearest = 8
|
||||
|
||||
inverted_residual_setting = [
|
||||
# t, c, n, s
|
||||
[1, 16, 1, 1],
|
||||
[6, 24, 2, 2],
|
||||
[6, 32, 3, 2],
|
||||
[6, 64, 4, 2],
|
||||
[6, 96, 3, 1],
|
||||
#[6, 160, 3, 2],
|
||||
#[6, 320, 1, 1],
|
||||
]
|
||||
|
||||
# only check the first element, assuming user knows t,c,n,s are required
|
||||
if len(inverted_residual_setting) == 0 or len(inverted_residual_setting[0]) != 4:
|
||||
raise ValueError("inverted_residual_setting should be non-empty "
|
||||
"or a 4-element list, got {}".format(inverted_residual_setting))
|
||||
|
||||
# building first layer
|
||||
input_channel = _make_divisible(input_channel * width_mult, round_nearest)
|
||||
self.last_channel = _make_divisible(last_channel * max(1.0, width_mult), round_nearest)
|
||||
features = [ConvBNReLU(4, input_channel, stride=2)]
|
||||
# building inverted residual blocks
|
||||
for t, c, n, s in inverted_residual_setting:
|
||||
output_channel = _make_divisible(c * width_mult, round_nearest)
|
||||
for i in range(n):
|
||||
stride = s if i == 0 else 1
|
||||
features.append(block(input_channel, output_channel, stride, expand_ratio=t))
|
||||
input_channel = output_channel
|
||||
|
||||
self.features = nn.Sequential(*features)
|
||||
self.fpn_selected = [1, 3, 6, 10, 13]
|
||||
# weight initialization
|
||||
for m in self.modules():
|
||||
if isinstance(m, nn.Conv2d):
|
||||
nn.init.kaiming_normal_(m.weight, mode='fan_out')
|
||||
if m.bias is not None:
|
||||
nn.init.zeros_(m.bias)
|
||||
elif isinstance(m, nn.BatchNorm2d):
|
||||
nn.init.ones_(m.weight)
|
||||
nn.init.zeros_(m.bias)
|
||||
elif isinstance(m, nn.Linear):
|
||||
nn.init.normal_(m.weight, 0, 0.01)
|
||||
nn.init.zeros_(m.bias)
|
||||
if pretrained:
|
||||
self._load_pretrained_model()
|
||||
|
||||
def _forward_impl(self, x):
|
||||
# This exists since TorchScript doesn't support inheritance, so the superclass method
|
||||
# (this one) needs to have a name other than `forward` that can be accessed in a subclass
|
||||
fpn_features = []
|
||||
for i, f in enumerate(self.features):
|
||||
if i > self.fpn_selected[-1]:
|
||||
break
|
||||
x = f(x)
|
||||
if i in self.fpn_selected:
|
||||
fpn_features.append(x)
|
||||
|
||||
c1, c2, c3, c4, c5 = fpn_features
|
||||
return c1, c2, c3, c4, c5
|
||||
|
||||
|
||||
def forward(self, x):
|
||||
return self._forward_impl(x)
|
||||
|
||||
def _load_pretrained_model(self):
|
||||
pretrain_dict = model_zoo.load_url('https://download.pytorch.org/models/mobilenet_v2-b0353104.pth')
|
||||
model_dict = {}
|
||||
state_dict = self.state_dict()
|
||||
for k, v in pretrain_dict.items():
|
||||
if k in state_dict:
|
||||
model_dict[k] = v
|
||||
state_dict.update(model_dict)
|
||||
self.load_state_dict(state_dict)
|
||||
|
||||
|
||||
class MobileV2_MLSD_Large(nn.Module):
|
||||
def __init__(self):
|
||||
super(MobileV2_MLSD_Large, self).__init__()
|
||||
|
||||
self.backbone = MobileNetV2(pretrained=False)
|
||||
## A, B
|
||||
self.block15 = BlockTypeA(in_c1= 64, in_c2= 96,
|
||||
out_c1= 64, out_c2=64,
|
||||
upscale=False)
|
||||
self.block16 = BlockTypeB(128, 64)
|
||||
|
||||
## A, B
|
||||
self.block17 = BlockTypeA(in_c1 = 32, in_c2 = 64,
|
||||
out_c1= 64, out_c2= 64)
|
||||
self.block18 = BlockTypeB(128, 64)
|
||||
|
||||
## A, B
|
||||
self.block19 = BlockTypeA(in_c1=24, in_c2=64,
|
||||
out_c1=64, out_c2=64)
|
||||
self.block20 = BlockTypeB(128, 64)
|
||||
|
||||
## A, B, C
|
||||
self.block21 = BlockTypeA(in_c1=16, in_c2=64,
|
||||
out_c1=64, out_c2=64)
|
||||
self.block22 = BlockTypeB(128, 64)
|
||||
|
||||
self.block23 = BlockTypeC(64, 16)
|
||||
|
||||
def forward(self, x):
|
||||
c1, c2, c3, c4, c5 = self.backbone(x)
|
||||
|
||||
x = self.block15(c4, c5)
|
||||
x = self.block16(x)
|
||||
|
||||
x = self.block17(c3, x)
|
||||
x = self.block18(x)
|
||||
|
||||
x = self.block19(c2, x)
|
||||
x = self.block20(x)
|
||||
|
||||
x = self.block21(c1, x)
|
||||
x = self.block22(x)
|
||||
x = self.block23(x)
|
||||
x = x[:, 7:, :, :]
|
||||
|
||||
return x
|
||||
275
comfy/annotator/mlsd/models/mbv2_mlsd_tiny.py
Normal file
@ -0,0 +1,275 @@
|
||||
import os
|
||||
import sys
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
import torch.utils.model_zoo as model_zoo
|
||||
from torch.nn import functional as F
|
||||
|
||||
|
||||
class BlockTypeA(nn.Module):
|
||||
def __init__(self, in_c1, in_c2, out_c1, out_c2, upscale = True):
|
||||
super(BlockTypeA, self).__init__()
|
||||
self.conv1 = nn.Sequential(
|
||||
nn.Conv2d(in_c2, out_c2, kernel_size=1),
|
||||
nn.BatchNorm2d(out_c2),
|
||||
nn.ReLU(inplace=True)
|
||||
)
|
||||
self.conv2 = nn.Sequential(
|
||||
nn.Conv2d(in_c1, out_c1, kernel_size=1),
|
||||
nn.BatchNorm2d(out_c1),
|
||||
nn.ReLU(inplace=True)
|
||||
)
|
||||
self.upscale = upscale
|
||||
|
||||
def forward(self, a, b):
|
||||
b = self.conv1(b)
|
||||
a = self.conv2(a)
|
||||
b = F.interpolate(b, scale_factor=2.0, mode='bilinear', align_corners=True)
|
||||
return torch.cat((a, b), dim=1)
|
||||
|
||||
|
||||
class BlockTypeB(nn.Module):
|
||||
def __init__(self, in_c, out_c):
|
||||
super(BlockTypeB, self).__init__()
|
||||
self.conv1 = nn.Sequential(
|
||||
nn.Conv2d(in_c, in_c, kernel_size=3, padding=1),
|
||||
nn.BatchNorm2d(in_c),
|
||||
nn.ReLU()
|
||||
)
|
||||
self.conv2 = nn.Sequential(
|
||||
nn.Conv2d(in_c, out_c, kernel_size=3, padding=1),
|
||||
nn.BatchNorm2d(out_c),
|
||||
nn.ReLU()
|
||||
)
|
||||
|
||||
def forward(self, x):
|
||||
x = self.conv1(x) + x
|
||||
x = self.conv2(x)
|
||||
return x
|
||||
|
||||
class BlockTypeC(nn.Module):
|
||||
def __init__(self, in_c, out_c):
|
||||
super(BlockTypeC, self).__init__()
|
||||
self.conv1 = nn.Sequential(
|
||||
nn.Conv2d(in_c, in_c, kernel_size=3, padding=5, dilation=5),
|
||||
nn.BatchNorm2d(in_c),
|
||||
nn.ReLU()
|
||||
)
|
||||
self.conv2 = nn.Sequential(
|
||||
nn.Conv2d(in_c, in_c, kernel_size=3, padding=1),
|
||||
nn.BatchNorm2d(in_c),
|
||||
nn.ReLU()
|
||||
)
|
||||
self.conv3 = nn.Conv2d(in_c, out_c, kernel_size=1)
|
||||
|
||||
def forward(self, x):
|
||||
x = self.conv1(x)
|
||||
x = self.conv2(x)
|
||||
x = self.conv3(x)
|
||||
return x
|
||||
|
||||
def _make_divisible(v, divisor, min_value=None):
|
||||
"""
|
||||
This function is taken from the original tf repo.
|
||||
It ensures that all layers have a channel number that is divisible by 8
|
||||
It can be seen here:
|
||||
https://github.com/tensorflow/models/blob/master/research/slim/nets/mobilenet/mobilenet.py
|
||||
:param v:
|
||||
:param divisor:
|
||||
:param min_value:
|
||||
:return:
|
||||
"""
|
||||
if min_value is None:
|
||||
min_value = divisor
|
||||
new_v = max(min_value, int(v + divisor / 2) // divisor * divisor)
|
||||
# Make sure that round down does not go down by more than 10%.
|
||||
if new_v < 0.9 * v:
|
||||
new_v += divisor
|
||||
return new_v
|
||||
|
||||
|
||||
class ConvBNReLU(nn.Sequential):
|
||||
def __init__(self, in_planes, out_planes, kernel_size=3, stride=1, groups=1):
|
||||
self.channel_pad = out_planes - in_planes
|
||||
self.stride = stride
|
||||
#padding = (kernel_size - 1) // 2
|
||||
|
||||
# TFLite uses slightly different padding than PyTorch
|
||||
if stride == 2:
|
||||
padding = 0
|
||||
else:
|
||||
padding = (kernel_size - 1) // 2
|
||||
|
||||
super(ConvBNReLU, self).__init__(
|
||||
nn.Conv2d(in_planes, out_planes, kernel_size, stride, padding, groups=groups, bias=False),
|
||||
nn.BatchNorm2d(out_planes),
|
||||
nn.ReLU6(inplace=True)
|
||||
)
|
||||
self.max_pool = nn.MaxPool2d(kernel_size=stride, stride=stride)
|
||||
|
||||
|
||||
def forward(self, x):
|
||||
# TFLite uses different padding
|
||||
if self.stride == 2:
|
||||
x = F.pad(x, (0, 1, 0, 1), "constant", 0)
|
||||
#print(x.shape)
|
||||
|
||||
for module in self:
|
||||
if not isinstance(module, nn.MaxPool2d):
|
||||
x = module(x)
|
||||
return x
|
||||
|
||||
|
||||
class InvertedResidual(nn.Module):
|
||||
def __init__(self, inp, oup, stride, expand_ratio):
|
||||
super(InvertedResidual, self).__init__()
|
||||
self.stride = stride
|
||||
assert stride in [1, 2]
|
||||
|
||||
hidden_dim = int(round(inp * expand_ratio))
|
||||
self.use_res_connect = self.stride == 1 and inp == oup
|
||||
|
||||
layers = []
|
||||
if expand_ratio != 1:
|
||||
# pw
|
||||
layers.append(ConvBNReLU(inp, hidden_dim, kernel_size=1))
|
||||
layers.extend([
|
||||
# dw
|
||||
ConvBNReLU(hidden_dim, hidden_dim, stride=stride, groups=hidden_dim),
|
||||
# pw-linear
|
||||
nn.Conv2d(hidden_dim, oup, 1, 1, 0, bias=False),
|
||||
nn.BatchNorm2d(oup),
|
||||
])
|
||||
self.conv = nn.Sequential(*layers)
|
||||
|
||||
def forward(self, x):
|
||||
if self.use_res_connect:
|
||||
return x + self.conv(x)
|
||||
else:
|
||||
return self.conv(x)
|
||||
|
||||
|
||||
class MobileNetV2(nn.Module):
|
||||
def __init__(self, pretrained=True):
|
||||
"""
|
||||
MobileNet V2 main class
|
||||
Args:
|
||||
num_classes (int): Number of classes
|
||||
width_mult (float): Width multiplier - adjusts number of channels in each layer by this amount
|
||||
inverted_residual_setting: Network structure
|
||||
round_nearest (int): Round the number of channels in each layer to be a multiple of this number
|
||||
Set to 1 to turn off rounding
|
||||
block: Module specifying inverted residual building block for mobilenet
|
||||
"""
|
||||
super(MobileNetV2, self).__init__()
|
||||
|
||||
block = InvertedResidual
|
||||
input_channel = 32
|
||||
last_channel = 1280
|
||||
width_mult = 1.0
|
||||
round_nearest = 8
|
||||
|
||||
inverted_residual_setting = [
|
||||
# t, c, n, s
|
||||
[1, 16, 1, 1],
|
||||
[6, 24, 2, 2],
|
||||
[6, 32, 3, 2],
|
||||
[6, 64, 4, 2],
|
||||
#[6, 96, 3, 1],
|
||||
#[6, 160, 3, 2],
|
||||
#[6, 320, 1, 1],
|
||||
]
|
||||
|
||||
# only check the first element, assuming user knows t,c,n,s are required
|
||||
if len(inverted_residual_setting) == 0 or len(inverted_residual_setting[0]) != 4:
|
||||
raise ValueError("inverted_residual_setting should be non-empty "
|
||||
"or a 4-element list, got {}".format(inverted_residual_setting))
|
||||
|
||||
# building first layer
|
||||
input_channel = _make_divisible(input_channel * width_mult, round_nearest)
|
||||
self.last_channel = _make_divisible(last_channel * max(1.0, width_mult), round_nearest)
|
||||
features = [ConvBNReLU(4, input_channel, stride=2)]
|
||||
# building inverted residual blocks
|
||||
for t, c, n, s in inverted_residual_setting:
|
||||
output_channel = _make_divisible(c * width_mult, round_nearest)
|
||||
for i in range(n):
|
||||
stride = s if i == 0 else 1
|
||||
features.append(block(input_channel, output_channel, stride, expand_ratio=t))
|
||||
input_channel = output_channel
|
||||
self.features = nn.Sequential(*features)
|
||||
|
||||
self.fpn_selected = [3, 6, 10]
|
||||
# weight initialization
|
||||
for m in self.modules():
|
||||
if isinstance(m, nn.Conv2d):
|
||||
nn.init.kaiming_normal_(m.weight, mode='fan_out')
|
||||
if m.bias is not None:
|
||||
nn.init.zeros_(m.bias)
|
||||
elif isinstance(m, nn.BatchNorm2d):
|
||||
nn.init.ones_(m.weight)
|
||||
nn.init.zeros_(m.bias)
|
||||
elif isinstance(m, nn.Linear):
|
||||
nn.init.normal_(m.weight, 0, 0.01)
|
||||
nn.init.zeros_(m.bias)
|
||||
|
||||
#if pretrained:
|
||||
# self._load_pretrained_model()
|
||||
|
||||
def _forward_impl(self, x):
|
||||
# This exists since TorchScript doesn't support inheritance, so the superclass method
|
||||
# (this one) needs to have a name other than `forward` that can be accessed in a subclass
|
||||
fpn_features = []
|
||||
for i, f in enumerate(self.features):
|
||||
if i > self.fpn_selected[-1]:
|
||||
break
|
||||
x = f(x)
|
||||
if i in self.fpn_selected:
|
||||
fpn_features.append(x)
|
||||
|
||||
c2, c3, c4 = fpn_features
|
||||
return c2, c3, c4
|
||||
|
||||
|
||||
def forward(self, x):
|
||||
return self._forward_impl(x)
|
||||
|
||||
def _load_pretrained_model(self):
|
||||
pretrain_dict = model_zoo.load_url('https://download.pytorch.org/models/mobilenet_v2-b0353104.pth')
|
||||
model_dict = {}
|
||||
state_dict = self.state_dict()
|
||||
for k, v in pretrain_dict.items():
|
||||
if k in state_dict:
|
||||
model_dict[k] = v
|
||||
state_dict.update(model_dict)
|
||||
self.load_state_dict(state_dict)
|
||||
|
||||
|
||||
class MobileV2_MLSD_Tiny(nn.Module):
|
||||
def __init__(self):
|
||||
super(MobileV2_MLSD_Tiny, self).__init__()
|
||||
|
||||
self.backbone = MobileNetV2(pretrained=True)
|
||||
|
||||
self.block12 = BlockTypeA(in_c1= 32, in_c2= 64,
|
||||
out_c1= 64, out_c2=64)
|
||||
self.block13 = BlockTypeB(128, 64)
|
||||
|
||||
self.block14 = BlockTypeA(in_c1 = 24, in_c2 = 64,
|
||||
out_c1= 32, out_c2= 32)
|
||||
self.block15 = BlockTypeB(64, 64)
|
||||
|
||||
self.block16 = BlockTypeC(64, 16)
|
||||
|
||||
def forward(self, x):
|
||||
c2, c3, c4 = self.backbone(x)
|
||||
|
||||
x = self.block12(c3, c4)
|
||||
x = self.block13(x)
|
||||
x = self.block14(c2, x)
|
||||
x = self.block15(x)
|
||||
x = self.block16(x)
|
||||
x = x[:, 7:, :, :]
|
||||
#print(x.shape)
|
||||
x = F.interpolate(x, scale_factor=2.0, mode='bilinear', align_corners=True)
|
||||
|
||||
return x
|
||||
580
comfy/annotator/mlsd/utils.py
Normal file
@ -0,0 +1,580 @@
|
||||
'''
|
||||
modified by lihaoweicv
|
||||
pytorch version
|
||||
'''
|
||||
|
||||
'''
|
||||
M-LSD
|
||||
Copyright 2021-present NAVER Corp.
|
||||
Apache License v2.0
|
||||
'''
|
||||
|
||||
import os
|
||||
import numpy as np
|
||||
import cv2
|
||||
import torch
|
||||
from torch.nn import functional as F
|
||||
|
||||
|
||||
def deccode_output_score_and_ptss(tpMap, topk_n = 200, ksize = 5):
|
||||
'''
|
||||
tpMap:
|
||||
center: tpMap[1, 0, :, :]
|
||||
displacement: tpMap[1, 1:5, :, :]
|
||||
'''
|
||||
b, c, h, w = tpMap.shape
|
||||
assert b==1, 'only support bsize==1'
|
||||
displacement = tpMap[:, 1:5, :, :][0]
|
||||
center = tpMap[:, 0, :, :]
|
||||
heat = torch.sigmoid(center)
|
||||
hmax = F.max_pool2d( heat, (ksize, ksize), stride=1, padding=(ksize-1)//2)
|
||||
keep = (hmax == heat).float()
|
||||
heat = heat * keep
|
||||
heat = heat.reshape(-1, )
|
||||
|
||||
scores, indices = torch.topk(heat, topk_n, dim=-1, largest=True)
|
||||
yy = torch.floor_divide(indices, w).unsqueeze(-1)
|
||||
xx = torch.fmod(indices, w).unsqueeze(-1)
|
||||
ptss = torch.cat((yy, xx),dim=-1)
|
||||
|
||||
ptss = ptss.detach().cpu().numpy()
|
||||
scores = scores.detach().cpu().numpy()
|
||||
displacement = displacement.detach().cpu().numpy()
|
||||
displacement = displacement.transpose((1,2,0))
|
||||
return ptss, scores, displacement
|
||||
|
||||
|
||||
def pred_lines(image, model,
|
||||
input_shape=[512, 512],
|
||||
score_thr=0.10,
|
||||
dist_thr=20.0):
|
||||
h, w, _ = image.shape
|
||||
h_ratio, w_ratio = [h / input_shape[0], w / input_shape[1]]
|
||||
|
||||
resized_image = np.concatenate([cv2.resize(image, (input_shape[1], input_shape[0]), interpolation=cv2.INTER_AREA),
|
||||
np.ones([input_shape[0], input_shape[1], 1])], axis=-1)
|
||||
|
||||
resized_image = resized_image.transpose((2,0,1))
|
||||
batch_image = np.expand_dims(resized_image, axis=0).astype('float32')
|
||||
batch_image = (batch_image / 127.5) - 1.0
|
||||
|
||||
batch_image = torch.from_numpy(batch_image).float().cuda()
|
||||
outputs = model(batch_image)
|
||||
pts, pts_score, vmap = deccode_output_score_and_ptss(outputs, 200, 3)
|
||||
start = vmap[:, :, :2]
|
||||
end = vmap[:, :, 2:]
|
||||
dist_map = np.sqrt(np.sum((start - end) ** 2, axis=-1))
|
||||
|
||||
segments_list = []
|
||||
for center, score in zip(pts, pts_score):
|
||||
y, x = center
|
||||
distance = dist_map[y, x]
|
||||
if score > score_thr and distance > dist_thr:
|
||||
disp_x_start, disp_y_start, disp_x_end, disp_y_end = vmap[y, x, :]
|
||||
x_start = x + disp_x_start
|
||||
y_start = y + disp_y_start
|
||||
x_end = x + disp_x_end
|
||||
y_end = y + disp_y_end
|
||||
segments_list.append([x_start, y_start, x_end, y_end])
|
||||
|
||||
lines = 2 * np.array(segments_list) # 256 > 512
|
||||
lines[:, 0] = lines[:, 0] * w_ratio
|
||||
lines[:, 1] = lines[:, 1] * h_ratio
|
||||
lines[:, 2] = lines[:, 2] * w_ratio
|
||||
lines[:, 3] = lines[:, 3] * h_ratio
|
||||
|
||||
return lines
|
||||
|
||||
|
||||
def pred_squares(image,
|
||||
model,
|
||||
input_shape=[512, 512],
|
||||
params={'score': 0.06,
|
||||
'outside_ratio': 0.28,
|
||||
'inside_ratio': 0.45,
|
||||
'w_overlap': 0.0,
|
||||
'w_degree': 1.95,
|
||||
'w_length': 0.0,
|
||||
'w_area': 1.86,
|
||||
'w_center': 0.14}):
|
||||
'''
|
||||
shape = [height, width]
|
||||
'''
|
||||
h, w, _ = image.shape
|
||||
original_shape = [h, w]
|
||||
|
||||
resized_image = np.concatenate([cv2.resize(image, (input_shape[0], input_shape[1]), interpolation=cv2.INTER_AREA),
|
||||
np.ones([input_shape[0], input_shape[1], 1])], axis=-1)
|
||||
resized_image = resized_image.transpose((2, 0, 1))
|
||||
batch_image = np.expand_dims(resized_image, axis=0).astype('float32')
|
||||
batch_image = (batch_image / 127.5) - 1.0
|
||||
|
||||
batch_image = torch.from_numpy(batch_image).float().cuda()
|
||||
outputs = model(batch_image)
|
||||
|
||||
pts, pts_score, vmap = deccode_output_score_and_ptss(outputs, 200, 3)
|
||||
start = vmap[:, :, :2] # (x, y)
|
||||
end = vmap[:, :, 2:] # (x, y)
|
||||
dist_map = np.sqrt(np.sum((start - end) ** 2, axis=-1))
|
||||
|
||||
junc_list = []
|
||||
segments_list = []
|
||||
for junc, score in zip(pts, pts_score):
|
||||
y, x = junc
|
||||
distance = dist_map[y, x]
|
||||
if score > params['score'] and distance > 20.0:
|
||||
junc_list.append([x, y])
|
||||
disp_x_start, disp_y_start, disp_x_end, disp_y_end = vmap[y, x, :]
|
||||
d_arrow = 1.0
|
||||
x_start = x + d_arrow * disp_x_start
|
||||
y_start = y + d_arrow * disp_y_start
|
||||
x_end = x + d_arrow * disp_x_end
|
||||
y_end = y + d_arrow * disp_y_end
|
||||
segments_list.append([x_start, y_start, x_end, y_end])
|
||||
|
||||
segments = np.array(segments_list)
|
||||
|
||||
####### post processing for squares
|
||||
# 1. get unique lines
|
||||
point = np.array([[0, 0]])
|
||||
point = point[0]
|
||||
start = segments[:, :2]
|
||||
end = segments[:, 2:]
|
||||
diff = start - end
|
||||
a = diff[:, 1]
|
||||
b = -diff[:, 0]
|
||||
c = a * start[:, 0] + b * start[:, 1]
|
||||
|
||||
d = np.abs(a * point[0] + b * point[1] - c) / np.sqrt(a ** 2 + b ** 2 + 1e-10)
|
||||
theta = np.arctan2(diff[:, 0], diff[:, 1]) * 180 / np.pi
|
||||
theta[theta < 0.0] += 180
|
||||
hough = np.concatenate([d[:, None], theta[:, None]], axis=-1)
|
||||
|
||||
d_quant = 1
|
||||
theta_quant = 2
|
||||
hough[:, 0] //= d_quant
|
||||
hough[:, 1] //= theta_quant
|
||||
_, indices, counts = np.unique(hough, axis=0, return_index=True, return_counts=True)
|
||||
|
||||
acc_map = np.zeros([512 // d_quant + 1, 360 // theta_quant + 1], dtype='float32')
|
||||
idx_map = np.zeros([512 // d_quant + 1, 360 // theta_quant + 1], dtype='int32') - 1
|
||||
yx_indices = hough[indices, :].astype('int32')
|
||||
acc_map[yx_indices[:, 0], yx_indices[:, 1]] = counts
|
||||
idx_map[yx_indices[:, 0], yx_indices[:, 1]] = indices
|
||||
|
||||
acc_map_np = acc_map
|
||||
# acc_map = acc_map[None, :, :, None]
|
||||
#
|
||||
# ### fast suppression using tensorflow op
|
||||
# acc_map = tf.constant(acc_map, dtype=tf.float32)
|
||||
# max_acc_map = tf.keras.layers.MaxPool2D(pool_size=(5, 5), strides=1, padding='same')(acc_map)
|
||||
# acc_map = acc_map * tf.cast(tf.math.equal(acc_map, max_acc_map), tf.float32)
|
||||
# flatten_acc_map = tf.reshape(acc_map, [1, -1])
|
||||
# topk_values, topk_indices = tf.math.top_k(flatten_acc_map, k=len(pts))
|
||||
# _, h, w, _ = acc_map.shape
|
||||
# y = tf.expand_dims(topk_indices // w, axis=-1)
|
||||
# x = tf.expand_dims(topk_indices % w, axis=-1)
|
||||
# yx = tf.concat([y, x], axis=-1)
|
||||
|
||||
### fast suppression using pytorch op
|
||||
acc_map = torch.from_numpy(acc_map_np).unsqueeze(0).unsqueeze(0)
|
||||
_,_, h, w = acc_map.shape
|
||||
max_acc_map = F.max_pool2d(acc_map,kernel_size=5, stride=1, padding=2)
|
||||
acc_map = acc_map * ( (acc_map == max_acc_map).float() )
|
||||
flatten_acc_map = acc_map.reshape([-1, ])
|
||||
|
||||
scores, indices = torch.topk(flatten_acc_map, len(pts), dim=-1, largest=True)
|
||||
yy = torch.div(indices, w, rounding_mode='floor').unsqueeze(-1)
|
||||
xx = torch.fmod(indices, w).unsqueeze(-1)
|
||||
yx = torch.cat((yy, xx), dim=-1)
|
||||
|
||||
yx = yx.detach().cpu().numpy()
|
||||
|
||||
topk_values = scores.detach().cpu().numpy()
|
||||
indices = idx_map[yx[:, 0], yx[:, 1]]
|
||||
basis = 5 // 2
|
||||
|
||||
merged_segments = []
|
||||
for yx_pt, max_indice, value in zip(yx, indices, topk_values):
|
||||
y, x = yx_pt
|
||||
if max_indice == -1 or value == 0:
|
||||
continue
|
||||
segment_list = []
|
||||
for y_offset in range(-basis, basis + 1):
|
||||
for x_offset in range(-basis, basis + 1):
|
||||
indice = idx_map[y + y_offset, x + x_offset]
|
||||
cnt = int(acc_map_np[y + y_offset, x + x_offset])
|
||||
if indice != -1:
|
||||
segment_list.append(segments[indice])
|
||||
if cnt > 1:
|
||||
check_cnt = 1
|
||||
current_hough = hough[indice]
|
||||
for new_indice, new_hough in enumerate(hough):
|
||||
if (current_hough == new_hough).all() and indice != new_indice:
|
||||
segment_list.append(segments[new_indice])
|
||||
check_cnt += 1
|
||||
if check_cnt == cnt:
|
||||
break
|
||||
group_segments = np.array(segment_list).reshape([-1, 2])
|
||||
sorted_group_segments = np.sort(group_segments, axis=0)
|
||||
x_min, y_min = sorted_group_segments[0, :]
|
||||
x_max, y_max = sorted_group_segments[-1, :]
|
||||
|
||||
deg = theta[max_indice]
|
||||
if deg >= 90:
|
||||
merged_segments.append([x_min, y_max, x_max, y_min])
|
||||
else:
|
||||
merged_segments.append([x_min, y_min, x_max, y_max])
|
||||
|
||||
# 2. get intersections
|
||||
new_segments = np.array(merged_segments) # (x1, y1, x2, y2)
|
||||
start = new_segments[:, :2] # (x1, y1)
|
||||
end = new_segments[:, 2:] # (x2, y2)
|
||||
new_centers = (start + end) / 2.0
|
||||
diff = start - end
|
||||
dist_segments = np.sqrt(np.sum(diff ** 2, axis=-1))
|
||||
|
||||
# ax + by = c
|
||||
a = diff[:, 1]
|
||||
b = -diff[:, 0]
|
||||
c = a * start[:, 0] + b * start[:, 1]
|
||||
pre_det = a[:, None] * b[None, :]
|
||||
det = pre_det - np.transpose(pre_det)
|
||||
|
||||
pre_inter_y = a[:, None] * c[None, :]
|
||||
inter_y = (pre_inter_y - np.transpose(pre_inter_y)) / (det + 1e-10)
|
||||
pre_inter_x = c[:, None] * b[None, :]
|
||||
inter_x = (pre_inter_x - np.transpose(pre_inter_x)) / (det + 1e-10)
|
||||
inter_pts = np.concatenate([inter_x[:, :, None], inter_y[:, :, None]], axis=-1).astype('int32')
|
||||
|
||||
# 3. get corner information
|
||||
# 3.1 get distance
|
||||
'''
|
||||
dist_segments:
|
||||
| dist(0), dist(1), dist(2), ...|
|
||||
dist_inter_to_segment1:
|
||||
| dist(inter,0), dist(inter,0), dist(inter,0), ... |
|
||||
| dist(inter,1), dist(inter,1), dist(inter,1), ... |
|
||||
...
|
||||
dist_inter_to_segment2:
|
||||
| dist(inter,0), dist(inter,1), dist(inter,2), ... |
|
||||
| dist(inter,0), dist(inter,1), dist(inter,2), ... |
|
||||
...
|
||||
'''
|
||||
|
||||
dist_inter_to_segment1_start = np.sqrt(
|
||||
np.sum(((inter_pts - start[:, None, :]) ** 2), axis=-1, keepdims=True)) # [n_batch, n_batch, 1]
|
||||
dist_inter_to_segment1_end = np.sqrt(
|
||||
np.sum(((inter_pts - end[:, None, :]) ** 2), axis=-1, keepdims=True)) # [n_batch, n_batch, 1]
|
||||
dist_inter_to_segment2_start = np.sqrt(
|
||||
np.sum(((inter_pts - start[None, :, :]) ** 2), axis=-1, keepdims=True)) # [n_batch, n_batch, 1]
|
||||
dist_inter_to_segment2_end = np.sqrt(
|
||||
np.sum(((inter_pts - end[None, :, :]) ** 2), axis=-1, keepdims=True)) # [n_batch, n_batch, 1]
|
||||
|
||||
# sort ascending
|
||||
dist_inter_to_segment1 = np.sort(
|
||||
np.concatenate([dist_inter_to_segment1_start, dist_inter_to_segment1_end], axis=-1),
|
||||
axis=-1) # [n_batch, n_batch, 2]
|
||||
dist_inter_to_segment2 = np.sort(
|
||||
np.concatenate([dist_inter_to_segment2_start, dist_inter_to_segment2_end], axis=-1),
|
||||
axis=-1) # [n_batch, n_batch, 2]
|
||||
|
||||
# 3.2 get degree
|
||||
inter_to_start = new_centers[:, None, :] - inter_pts
|
||||
deg_inter_to_start = np.arctan2(inter_to_start[:, :, 1], inter_to_start[:, :, 0]) * 180 / np.pi
|
||||
deg_inter_to_start[deg_inter_to_start < 0.0] += 360
|
||||
inter_to_end = new_centers[None, :, :] - inter_pts
|
||||
deg_inter_to_end = np.arctan2(inter_to_end[:, :, 1], inter_to_end[:, :, 0]) * 180 / np.pi
|
||||
deg_inter_to_end[deg_inter_to_end < 0.0] += 360
|
||||
|
||||
'''
|
||||
B -- G
|
||||
| |
|
||||
C -- R
|
||||
B : blue / G: green / C: cyan / R: red
|
||||
|
||||
0 -- 1
|
||||
| |
|
||||
3 -- 2
|
||||
'''
|
||||
# rename variables
|
||||
deg1_map, deg2_map = deg_inter_to_start, deg_inter_to_end
|
||||
# sort deg ascending
|
||||
deg_sort = np.sort(np.concatenate([deg1_map[:, :, None], deg2_map[:, :, None]], axis=-1), axis=-1)
|
||||
|
||||
deg_diff_map = np.abs(deg1_map - deg2_map)
|
||||
# we only consider the smallest degree of intersect
|
||||
deg_diff_map[deg_diff_map > 180] = 360 - deg_diff_map[deg_diff_map > 180]
|
||||
|
||||
# define available degree range
|
||||
deg_range = [60, 120]
|
||||
|
||||
corner_dict = {corner_info: [] for corner_info in range(4)}
|
||||
inter_points = []
|
||||
for i in range(inter_pts.shape[0]):
|
||||
for j in range(i + 1, inter_pts.shape[1]):
|
||||
# i, j > line index, always i < j
|
||||
x, y = inter_pts[i, j, :]
|
||||
deg1, deg2 = deg_sort[i, j, :]
|
||||
deg_diff = deg_diff_map[i, j]
|
||||
|
||||
check_degree = deg_diff > deg_range[0] and deg_diff < deg_range[1]
|
||||
|
||||
outside_ratio = params['outside_ratio'] # over ratio >>> drop it!
|
||||
inside_ratio = params['inside_ratio'] # over ratio >>> drop it!
|
||||
check_distance = ((dist_inter_to_segment1[i, j, 1] >= dist_segments[i] and \
|
||||
dist_inter_to_segment1[i, j, 0] <= dist_segments[i] * outside_ratio) or \
|
||||
(dist_inter_to_segment1[i, j, 1] <= dist_segments[i] and \
|
||||
dist_inter_to_segment1[i, j, 0] <= dist_segments[i] * inside_ratio)) and \
|
||||
((dist_inter_to_segment2[i, j, 1] >= dist_segments[j] and \
|
||||
dist_inter_to_segment2[i, j, 0] <= dist_segments[j] * outside_ratio) or \
|
||||
(dist_inter_to_segment2[i, j, 1] <= dist_segments[j] and \
|
||||
dist_inter_to_segment2[i, j, 0] <= dist_segments[j] * inside_ratio))
|
||||
|
||||
if check_degree and check_distance:
|
||||
corner_info = None
|
||||
|
||||
if (deg1 >= 0 and deg1 <= 45 and deg2 >= 45 and deg2 <= 120) or \
|
||||
(deg2 >= 315 and deg1 >= 45 and deg1 <= 120):
|
||||
corner_info, color_info = 0, 'blue'
|
||||
elif (deg1 >= 45 and deg1 <= 125 and deg2 >= 125 and deg2 <= 225):
|
||||
corner_info, color_info = 1, 'green'
|
||||
elif (deg1 >= 125 and deg1 <= 225 and deg2 >= 225 and deg2 <= 315):
|
||||
corner_info, color_info = 2, 'black'
|
||||
elif (deg1 >= 0 and deg1 <= 45 and deg2 >= 225 and deg2 <= 315) or \
|
||||
(deg2 >= 315 and deg1 >= 225 and deg1 <= 315):
|
||||
corner_info, color_info = 3, 'cyan'
|
||||
else:
|
||||
corner_info, color_info = 4, 'red' # we don't use it
|
||||
continue
|
||||
|
||||
corner_dict[corner_info].append([x, y, i, j])
|
||||
inter_points.append([x, y])
|
||||
|
||||
square_list = []
|
||||
connect_list = []
|
||||
segments_list = []
|
||||
for corner0 in corner_dict[0]:
|
||||
for corner1 in corner_dict[1]:
|
||||
connect01 = False
|
||||
for corner0_line in corner0[2:]:
|
||||
if corner0_line in corner1[2:]:
|
||||
connect01 = True
|
||||
break
|
||||
if connect01:
|
||||
for corner2 in corner_dict[2]:
|
||||
connect12 = False
|
||||
for corner1_line in corner1[2:]:
|
||||
if corner1_line in corner2[2:]:
|
||||
connect12 = True
|
||||
break
|
||||
if connect12:
|
||||
for corner3 in corner_dict[3]:
|
||||
connect23 = False
|
||||
for corner2_line in corner2[2:]:
|
||||
if corner2_line in corner3[2:]:
|
||||
connect23 = True
|
||||
break
|
||||
if connect23:
|
||||
for corner3_line in corner3[2:]:
|
||||
if corner3_line in corner0[2:]:
|
||||
# SQUARE!!!
|
||||
'''
|
||||
0 -- 1
|
||||
| |
|
||||
3 -- 2
|
||||
square_list:
|
||||
order: 0 > 1 > 2 > 3
|
||||
| x0, y0, x1, y1, x2, y2, x3, y3 |
|
||||
| x0, y0, x1, y1, x2, y2, x3, y3 |
|
||||
...
|
||||
connect_list:
|
||||
order: 01 > 12 > 23 > 30
|
||||
| line_idx01, line_idx12, line_idx23, line_idx30 |
|
||||
| line_idx01, line_idx12, line_idx23, line_idx30 |
|
||||
...
|
||||
segments_list:
|
||||
order: 0 > 1 > 2 > 3
|
||||
| line_idx0_i, line_idx0_j, line_idx1_i, line_idx1_j, line_idx2_i, line_idx2_j, line_idx3_i, line_idx3_j |
|
||||
| line_idx0_i, line_idx0_j, line_idx1_i, line_idx1_j, line_idx2_i, line_idx2_j, line_idx3_i, line_idx3_j |
|
||||
...
|
||||
'''
|
||||
square_list.append(corner0[:2] + corner1[:2] + corner2[:2] + corner3[:2])
|
||||
connect_list.append([corner0_line, corner1_line, corner2_line, corner3_line])
|
||||
segments_list.append(corner0[2:] + corner1[2:] + corner2[2:] + corner3[2:])
|
||||
|
||||
def check_outside_inside(segments_info, connect_idx):
|
||||
# return 'outside or inside', min distance, cover_param, peri_param
|
||||
if connect_idx == segments_info[0]:
|
||||
check_dist_mat = dist_inter_to_segment1
|
||||
else:
|
||||
check_dist_mat = dist_inter_to_segment2
|
||||
|
||||
i, j = segments_info
|
||||
min_dist, max_dist = check_dist_mat[i, j, :]
|
||||
connect_dist = dist_segments[connect_idx]
|
||||
if max_dist > connect_dist:
|
||||
return 'outside', min_dist, 0, 1
|
||||
else:
|
||||
return 'inside', min_dist, -1, -1
|
||||
|
||||
top_square = None
|
||||
|
||||
try:
|
||||
map_size = input_shape[0] / 2
|
||||
squares = np.array(square_list).reshape([-1, 4, 2])
|
||||
score_array = []
|
||||
connect_array = np.array(connect_list)
|
||||
segments_array = np.array(segments_list).reshape([-1, 4, 2])
|
||||
|
||||
# get degree of corners:
|
||||
squares_rollup = np.roll(squares, 1, axis=1)
|
||||
squares_rolldown = np.roll(squares, -1, axis=1)
|
||||
vec1 = squares_rollup - squares
|
||||
normalized_vec1 = vec1 / (np.linalg.norm(vec1, axis=-1, keepdims=True) + 1e-10)
|
||||
vec2 = squares_rolldown - squares
|
||||
normalized_vec2 = vec2 / (np.linalg.norm(vec2, axis=-1, keepdims=True) + 1e-10)
|
||||
inner_products = np.sum(normalized_vec1 * normalized_vec2, axis=-1) # [n_squares, 4]
|
||||
squares_degree = np.arccos(inner_products) * 180 / np.pi # [n_squares, 4]
|
||||
|
||||
# get square score
|
||||
overlap_scores = []
|
||||
degree_scores = []
|
||||
length_scores = []
|
||||
|
||||
for connects, segments, square, degree in zip(connect_array, segments_array, squares, squares_degree):
|
||||
'''
|
||||
0 -- 1
|
||||
| |
|
||||
3 -- 2
|
||||
|
||||
# segments: [4, 2]
|
||||
# connects: [4]
|
||||
'''
|
||||
|
||||
###################################### OVERLAP SCORES
|
||||
cover = 0
|
||||
perimeter = 0
|
||||
# check 0 > 1 > 2 > 3
|
||||
square_length = []
|
||||
|
||||
for start_idx in range(4):
|
||||
end_idx = (start_idx + 1) % 4
|
||||
|
||||
connect_idx = connects[start_idx] # segment idx of segment01
|
||||
start_segments = segments[start_idx]
|
||||
end_segments = segments[end_idx]
|
||||
|
||||
start_point = square[start_idx]
|
||||
end_point = square[end_idx]
|
||||
|
||||
# check whether outside or inside
|
||||
start_position, start_min, start_cover_param, start_peri_param = check_outside_inside(start_segments,
|
||||
connect_idx)
|
||||
end_position, end_min, end_cover_param, end_peri_param = check_outside_inside(end_segments, connect_idx)
|
||||
|
||||
cover += dist_segments[connect_idx] + start_cover_param * start_min + end_cover_param * end_min
|
||||
perimeter += dist_segments[connect_idx] + start_peri_param * start_min + end_peri_param * end_min
|
||||
|
||||
square_length.append(
|
||||
dist_segments[connect_idx] + start_peri_param * start_min + end_peri_param * end_min)
|
||||
|
||||
overlap_scores.append(cover / perimeter)
|
||||
######################################
|
||||
###################################### DEGREE SCORES
|
||||
'''
|
||||
deg0 vs deg2
|
||||
deg1 vs deg3
|
||||
'''
|
||||
deg0, deg1, deg2, deg3 = degree
|
||||
deg_ratio1 = deg0 / deg2
|
||||
if deg_ratio1 > 1.0:
|
||||
deg_ratio1 = 1 / deg_ratio1
|
||||
deg_ratio2 = deg1 / deg3
|
||||
if deg_ratio2 > 1.0:
|
||||
deg_ratio2 = 1 / deg_ratio2
|
||||
degree_scores.append((deg_ratio1 + deg_ratio2) / 2)
|
||||
######################################
|
||||
###################################### LENGTH SCORES
|
||||
'''
|
||||
len0 vs len2
|
||||
len1 vs len3
|
||||
'''
|
||||
len0, len1, len2, len3 = square_length
|
||||
len_ratio1 = len0 / len2 if len2 > len0 else len2 / len0
|
||||
len_ratio2 = len1 / len3 if len3 > len1 else len3 / len1
|
||||
length_scores.append((len_ratio1 + len_ratio2) / 2)
|
||||
|
||||
######################################
|
||||
|
||||
overlap_scores = np.array(overlap_scores)
|
||||
overlap_scores /= np.max(overlap_scores)
|
||||
|
||||
degree_scores = np.array(degree_scores)
|
||||
# degree_scores /= np.max(degree_scores)
|
||||
|
||||
length_scores = np.array(length_scores)
|
||||
|
||||
###################################### AREA SCORES
|
||||
area_scores = np.reshape(squares, [-1, 4, 2])
|
||||
area_x = area_scores[:, :, 0]
|
||||
area_y = area_scores[:, :, 1]
|
||||
correction = area_x[:, -1] * area_y[:, 0] - area_y[:, -1] * area_x[:, 0]
|
||||
area_scores = np.sum(area_x[:, :-1] * area_y[:, 1:], axis=-1) - np.sum(area_y[:, :-1] * area_x[:, 1:], axis=-1)
|
||||
area_scores = 0.5 * np.abs(area_scores + correction)
|
||||
area_scores /= (map_size * map_size) # np.max(area_scores)
|
||||
######################################
|
||||
|
||||
###################################### CENTER SCORES
|
||||
centers = np.array([[256 // 2, 256 // 2]], dtype='float32') # [1, 2]
|
||||
# squares: [n, 4, 2]
|
||||
square_centers = np.mean(squares, axis=1) # [n, 2]
|
||||
center2center = np.sqrt(np.sum((centers - square_centers) ** 2))
|
||||
center_scores = center2center / (map_size / np.sqrt(2.0))
|
||||
|
||||
'''
|
||||
score_w = [overlap, degree, area, center, length]
|
||||
'''
|
||||
score_w = [0.0, 1.0, 10.0, 0.5, 1.0]
|
||||
score_array = params['w_overlap'] * overlap_scores \
|
||||
+ params['w_degree'] * degree_scores \
|
||||
+ params['w_area'] * area_scores \
|
||||
- params['w_center'] * center_scores \
|
||||
+ params['w_length'] * length_scores
|
||||
|
||||
best_square = []
|
||||
|
||||
sorted_idx = np.argsort(score_array)[::-1]
|
||||
score_array = score_array[sorted_idx]
|
||||
squares = squares[sorted_idx]
|
||||
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
'''return list
|
||||
merged_lines, squares, scores
|
||||
'''
|
||||
|
||||
try:
|
||||
new_segments[:, 0] = new_segments[:, 0] * 2 / input_shape[1] * original_shape[1]
|
||||
new_segments[:, 1] = new_segments[:, 1] * 2 / input_shape[0] * original_shape[0]
|
||||
new_segments[:, 2] = new_segments[:, 2] * 2 / input_shape[1] * original_shape[1]
|
||||
new_segments[:, 3] = new_segments[:, 3] * 2 / input_shape[0] * original_shape[0]
|
||||
except Exception:
|
||||
new_segments = []
|
||||
|
||||
try:
|
||||
squares[:, :, 0] = squares[:, :, 0] * 2 / input_shape[1] * original_shape[1]
|
||||
squares[:, :, 1] = squares[:, :, 1] * 2 / input_shape[0] * original_shape[0]
|
||||
except Exception:
|
||||
squares = []
|
||||
score_array = []
|
||||
|
||||
try:
|
||||
inter_points = np.array(inter_points)
|
||||
inter_points[:, 0] = inter_points[:, 0] * 2 / input_shape[1] * original_shape[1]
|
||||
inter_points[:, 1] = inter_points[:, 1] * 2 / input_shape[0] * original_shape[0]
|
||||
except Exception:
|
||||
inter_points = []
|
||||
|
||||
return new_segments, squares, score_array, inter_points
|
||||
44
comfy/annotator/openpose/__init__.py
Normal file
@ -0,0 +1,44 @@
|
||||
import os
|
||||
os.environ["KMP_DUPLICATE_LIB_OK"]="TRUE"
|
||||
|
||||
import torch
|
||||
import numpy as np
|
||||
from . import util
|
||||
from .body import Body
|
||||
from .hand import Hand
|
||||
from annotator.util import annotator_ckpts_path
|
||||
|
||||
|
||||
body_model_path = "https://huggingface.co/lllyasviel/ControlNet/resolve/main/annotator/ckpts/body_pose_model.pth"
|
||||
hand_model_path = "https://huggingface.co/lllyasviel/ControlNet/resolve/main/annotator/ckpts/hand_pose_model.pth"
|
||||
|
||||
|
||||
class OpenposeDetector:
|
||||
def __init__(self):
|
||||
body_modelpath = os.path.join(annotator_ckpts_path, "body_pose_model.pth")
|
||||
hand_modelpath = os.path.join(annotator_ckpts_path, "hand_pose_model.pth")
|
||||
|
||||
if not os.path.exists(hand_modelpath):
|
||||
from basicsr.utils.download_util import load_file_from_url
|
||||
load_file_from_url(body_model_path, model_dir=annotator_ckpts_path)
|
||||
load_file_from_url(hand_model_path, model_dir=annotator_ckpts_path)
|
||||
|
||||
self.body_estimation = Body(body_modelpath)
|
||||
self.hand_estimation = Hand(hand_modelpath)
|
||||
|
||||
def __call__(self, oriImg, hand=False):
|
||||
oriImg = oriImg[:, :, ::-1].copy()
|
||||
with torch.no_grad():
|
||||
candidate, subset = self.body_estimation(oriImg)
|
||||
canvas = np.zeros_like(oriImg)
|
||||
canvas = util.draw_bodypose(canvas, candidate, subset)
|
||||
if hand:
|
||||
hands_list = util.handDetect(candidate, subset, oriImg)
|
||||
all_hand_peaks = []
|
||||
for x, y, w, is_left in hands_list:
|
||||
peaks = self.hand_estimation(oriImg[y:y+w, x:x+w, :])
|
||||
peaks[:, 0] = np.where(peaks[:, 0] == 0, peaks[:, 0], peaks[:, 0] + x)
|
||||
peaks[:, 1] = np.where(peaks[:, 1] == 0, peaks[:, 1], peaks[:, 1] + y)
|
||||
all_hand_peaks.append(peaks)
|
||||
canvas = util.draw_handpose(canvas, all_hand_peaks)
|
||||
return canvas, dict(candidate=candidate.tolist(), subset=subset.tolist())
|
||||
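Similarly, a hedged usage sketch for the pose detector above (not part of the commit; file names are placeholders and the input is assumed to be an RGB HxWx3 uint8 array as in the ControlNet reference app):

import cv2

openpose = OpenposeDetector()                      # fetches the body/hand checkpoints on first use
img = cv2.imread("person.jpg")[:, :, ::-1].copy()  # BGR -> RGB
pose_map, keypoints = openpose(img, hand=True)     # canvas with the drawn skeleton plus raw keypoint dict
cv2.imwrite("pose.png", pose_map)                  # canvas channel order may need swapping before saving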
219
comfy/annotator/openpose/body.py
Normal file
@ -0,0 +1,219 @@
|
||||
import cv2
|
||||
import numpy as np
|
||||
import math
|
||||
import time
|
||||
from scipy.ndimage.filters import gaussian_filter
|
||||
import matplotlib.pyplot as plt
|
||||
import matplotlib
|
||||
import torch
|
||||
from torchvision import transforms
|
||||
|
||||
from . import util
|
||||
from .model import bodypose_model
|
||||
|
||||
class Body(object):
|
||||
def __init__(self, model_path):
|
||||
self.model = bodypose_model()
|
||||
if torch.cuda.is_available():
|
||||
self.model = self.model.cuda()
|
||||
print('cuda')
|
||||
model_dict = util.transfer(self.model, torch.load(model_path))
|
||||
self.model.load_state_dict(model_dict)
|
||||
self.model.eval()
|
||||
|
||||
def __call__(self, oriImg):
|
||||
# scale_search = [0.5, 1.0, 1.5, 2.0]
|
||||
scale_search = [0.5]
|
||||
boxsize = 368
|
||||
stride = 8
|
||||
padValue = 128
|
||||
thre1 = 0.1
|
||||
thre2 = 0.05
|
||||
multiplier = [x * boxsize / oriImg.shape[0] for x in scale_search]
|
||||
heatmap_avg = np.zeros((oriImg.shape[0], oriImg.shape[1], 19))
|
||||
paf_avg = np.zeros((oriImg.shape[0], oriImg.shape[1], 38))
|
||||
|
||||
for m in range(len(multiplier)):
|
||||
scale = multiplier[m]
|
||||
imageToTest = cv2.resize(oriImg, (0, 0), fx=scale, fy=scale, interpolation=cv2.INTER_CUBIC)
|
||||
imageToTest_padded, pad = util.padRightDownCorner(imageToTest, stride, padValue)
|
||||
im = np.transpose(np.float32(imageToTest_padded[:, :, :, np.newaxis]), (3, 2, 0, 1)) / 256 - 0.5
|
||||
im = np.ascontiguousarray(im)
|
||||
|
||||
data = torch.from_numpy(im).float()
|
||||
if torch.cuda.is_available():
|
||||
data = data.cuda()
|
||||
# data = data.permute([2, 0, 1]).unsqueeze(0).float()
|
||||
with torch.no_grad():
|
||||
Mconv7_stage6_L1, Mconv7_stage6_L2 = self.model(data)
|
||||
Mconv7_stage6_L1 = Mconv7_stage6_L1.cpu().numpy()
|
||||
Mconv7_stage6_L2 = Mconv7_stage6_L2.cpu().numpy()
|
||||
|
||||
# extract outputs, resize, and remove padding
|
||||
# heatmap = np.transpose(np.squeeze(net.blobs[output_blobs.keys()[1]].data), (1, 2, 0)) # output 1 is heatmaps
|
||||
heatmap = np.transpose(np.squeeze(Mconv7_stage6_L2), (1, 2, 0)) # output 1 is heatmaps
|
||||
heatmap = cv2.resize(heatmap, (0, 0), fx=stride, fy=stride, interpolation=cv2.INTER_CUBIC)
|
||||
heatmap = heatmap[:imageToTest_padded.shape[0] - pad[2], :imageToTest_padded.shape[1] - pad[3], :]
|
||||
heatmap = cv2.resize(heatmap, (oriImg.shape[1], oriImg.shape[0]), interpolation=cv2.INTER_CUBIC)
|
||||
|
||||
# paf = np.transpose(np.squeeze(net.blobs[output_blobs.keys()[0]].data), (1, 2, 0)) # output 0 is PAFs
|
||||
paf = np.transpose(np.squeeze(Mconv7_stage6_L1), (1, 2, 0)) # output 0 is PAFs
|
||||
paf = cv2.resize(paf, (0, 0), fx=stride, fy=stride, interpolation=cv2.INTER_CUBIC)
|
||||
paf = paf[:imageToTest_padded.shape[0] - pad[2], :imageToTest_padded.shape[1] - pad[3], :]
|
||||
paf = cv2.resize(paf, (oriImg.shape[1], oriImg.shape[0]), interpolation=cv2.INTER_CUBIC)
|
||||
|
||||
heatmap_avg = heatmap_avg + heatmap / len(multiplier)
|
||||
paf_avg = paf_avg + paf / len(multiplier)
|
||||
|
||||
all_peaks = []
|
||||
peak_counter = 0
|
||||
|
||||
for part in range(18):
|
||||
map_ori = heatmap_avg[:, :, part]
|
||||
one_heatmap = gaussian_filter(map_ori, sigma=3)
|
||||
|
||||
map_left = np.zeros(one_heatmap.shape)
|
||||
map_left[1:, :] = one_heatmap[:-1, :]
|
||||
map_right = np.zeros(one_heatmap.shape)
|
||||
map_right[:-1, :] = one_heatmap[1:, :]
|
||||
map_up = np.zeros(one_heatmap.shape)
|
||||
map_up[:, 1:] = one_heatmap[:, :-1]
|
||||
map_down = np.zeros(one_heatmap.shape)
|
||||
map_down[:, :-1] = one_heatmap[:, 1:]
|
||||
|
||||
peaks_binary = np.logical_and.reduce(
|
||||
(one_heatmap >= map_left, one_heatmap >= map_right, one_heatmap >= map_up, one_heatmap >= map_down, one_heatmap > thre1))
|
||||
peaks = list(zip(np.nonzero(peaks_binary)[1], np.nonzero(peaks_binary)[0])) # note reverse
|
||||
peaks_with_score = [x + (map_ori[x[1], x[0]],) for x in peaks]
|
||||
peak_id = range(peak_counter, peak_counter + len(peaks))
|
||||
peaks_with_score_and_id = [peaks_with_score[i] + (peak_id[i],) for i in range(len(peak_id))]
|
||||
|
||||
all_peaks.append(peaks_with_score_and_id)
|
||||
peak_counter += len(peaks)
|
||||
|
||||
        # find connection in the specified sequence, center 29 is in the position 15
        limbSeq = [[2, 3], [2, 6], [3, 4], [4, 5], [6, 7], [7, 8], [2, 9], [9, 10], \
                   [10, 11], [2, 12], [12, 13], [13, 14], [2, 1], [1, 15], [15, 17], \
                   [1, 16], [16, 18], [3, 17], [6, 18]]
        # the middle joints heatmap correspondence
        mapIdx = [[31, 32], [39, 40], [33, 34], [35, 36], [41, 42], [43, 44], [19, 20], [21, 22], \
                  [23, 24], [25, 26], [27, 28], [29, 30], [47, 48], [49, 50], [53, 54], [51, 52], \
                  [55, 56], [37, 38], [45, 46]]

        connection_all = []
        special_k = []
        mid_num = 10

        for k in range(len(mapIdx)):
            score_mid = paf_avg[:, :, [x - 19 for x in mapIdx[k]]]
            candA = all_peaks[limbSeq[k][0] - 1]
            candB = all_peaks[limbSeq[k][1] - 1]
            nA = len(candA)
            nB = len(candB)
            indexA, indexB = limbSeq[k]
            if (nA != 0 and nB != 0):
                connection_candidate = []
                for i in range(nA):
                    for j in range(nB):
                        vec = np.subtract(candB[j][:2], candA[i][:2])
                        norm = math.sqrt(vec[0] * vec[0] + vec[1] * vec[1])
                        norm = max(0.001, norm)
                        vec = np.divide(vec, norm)

                        startend = list(zip(np.linspace(candA[i][0], candB[j][0], num=mid_num), \
                                            np.linspace(candA[i][1], candB[j][1], num=mid_num)))

                        vec_x = np.array([score_mid[int(round(startend[I][1])), int(round(startend[I][0])), 0] \
                                          for I in range(len(startend))])
                        vec_y = np.array([score_mid[int(round(startend[I][1])), int(round(startend[I][0])), 1] \
                                          for I in range(len(startend))])

                        score_midpts = np.multiply(vec_x, vec[0]) + np.multiply(vec_y, vec[1])
                        score_with_dist_prior = sum(score_midpts) / len(score_midpts) + min(
                            0.5 * oriImg.shape[0] / norm - 1, 0)
                        criterion1 = len(np.nonzero(score_midpts > thre2)[0]) > 0.8 * len(score_midpts)
                        criterion2 = score_with_dist_prior > 0
                        if criterion1 and criterion2:
                            connection_candidate.append(
                                [i, j, score_with_dist_prior, score_with_dist_prior + candA[i][2] + candB[j][2]])

                connection_candidate = sorted(connection_candidate, key=lambda x: x[2], reverse=True)
                connection = np.zeros((0, 5))
                for c in range(len(connection_candidate)):
                    i, j, s = connection_candidate[c][0:3]
                    if (i not in connection[:, 3] and j not in connection[:, 4]):
                        connection = np.vstack([connection, [candA[i][3], candB[j][3], s, i, j]])
                        if (len(connection) >= min(nA, nB)):
                            break

                connection_all.append(connection)
            else:
                special_k.append(k)
                connection_all.append([])
        # last number in each row is the total parts number of that person
        # the second last number in each row is the score of the overall configuration
        subset = -1 * np.ones((0, 20))
        candidate = np.array([item for sublist in all_peaks for item in sublist])

        for k in range(len(mapIdx)):
            if k not in special_k:
                partAs = connection_all[k][:, 0]
                partBs = connection_all[k][:, 1]
                indexA, indexB = np.array(limbSeq[k]) - 1

                for i in range(len(connection_all[k])):  # = 1:size(temp,1)
                    found = 0
                    subset_idx = [-1, -1]
                    for j in range(len(subset)):  # 1:size(subset,1):
                        if subset[j][indexA] == partAs[i] or subset[j][indexB] == partBs[i]:
                            subset_idx[found] = j
                            found += 1

                    if found == 1:
                        j = subset_idx[0]
                        if subset[j][indexB] != partBs[i]:
                            subset[j][indexB] = partBs[i]
                            subset[j][-1] += 1
                            subset[j][-2] += candidate[partBs[i].astype(int), 2] + connection_all[k][i][2]
                    elif found == 2:  # if found 2 and disjoint, merge them
                        j1, j2 = subset_idx
                        membership = ((subset[j1] >= 0).astype(int) + (subset[j2] >= 0).astype(int))[:-2]
                        if len(np.nonzero(membership == 2)[0]) == 0:  # merge
                            subset[j1][:-2] += (subset[j2][:-2] + 1)
                            subset[j1][-2:] += subset[j2][-2:]
                            subset[j1][-2] += connection_all[k][i][2]
                            subset = np.delete(subset, j2, 0)
                        else:  # same as found == 1
                            subset[j1][indexB] = partBs[i]
                            subset[j1][-1] += 1
                            subset[j1][-2] += candidate[partBs[i].astype(int), 2] + connection_all[k][i][2]

                    # if no partA was found in any existing subset, create a new one
                    elif not found and k < 17:
                        row = -1 * np.ones(20)
                        row[indexA] = partAs[i]
                        row[indexB] = partBs[i]
                        row[-1] = 2
                        row[-2] = sum(candidate[connection_all[k][i, :2].astype(int), 2]) + connection_all[k][i][2]
                        subset = np.vstack([subset, row])
        # delete rows of subset that have too few parts
        deleteIdx = []
        for i in range(len(subset)):
            if subset[i][-1] < 4 or subset[i][-2] / subset[i][-1] < 0.4:
                deleteIdx.append(i)
        subset = np.delete(subset, deleteIdx, axis=0)

        # subset: n*20 array, 0-17 is the index in candidate, 18 is the total score, 19 is the total parts
        # candidate: x, y, score, id
        return candidate, subset


if __name__ == "__main__":
    body_estimation = Body('../model/body_pose_model.pth')

    test_image = '../images/ski.jpg'
    oriImg = cv2.imread(test_image)  # B,G,R order
    candidate, subset = body_estimation(oriImg)
    canvas = util.draw_bodypose(oriImg, candidate, subset)
    plt.imshow(canvas[:, :, [2, 1, 0]])
    plt.show()
86
comfy/annotator/openpose/hand.py
Normal file
@ -0,0 +1,86 @@
import cv2
import json
import numpy as np
import math
import time
from scipy.ndimage.filters import gaussian_filter
import matplotlib.pyplot as plt
import matplotlib
import torch
from skimage.measure import label

from .model import handpose_model
from . import util


class Hand(object):
    def __init__(self, model_path):
        self.model = handpose_model()
        if torch.cuda.is_available():
            self.model = self.model.cuda()
            print('cuda')
        model_dict = util.transfer(self.model, torch.load(model_path))
        self.model.load_state_dict(model_dict)
        self.model.eval()

    def __call__(self, oriImg):
        scale_search = [0.5, 1.0, 1.5, 2.0]
        # scale_search = [0.5]
        boxsize = 368
        stride = 8
        padValue = 128
        thre = 0.05
        multiplier = [x * boxsize / oriImg.shape[0] for x in scale_search]
        heatmap_avg = np.zeros((oriImg.shape[0], oriImg.shape[1], 22))
        # paf_avg = np.zeros((oriImg.shape[0], oriImg.shape[1], 38))

        for m in range(len(multiplier)):
            scale = multiplier[m]
            imageToTest = cv2.resize(oriImg, (0, 0), fx=scale, fy=scale, interpolation=cv2.INTER_CUBIC)
            imageToTest_padded, pad = util.padRightDownCorner(imageToTest, stride, padValue)
            im = np.transpose(np.float32(imageToTest_padded[:, :, :, np.newaxis]), (3, 2, 0, 1)) / 256 - 0.5
            im = np.ascontiguousarray(im)

            data = torch.from_numpy(im).float()
            if torch.cuda.is_available():
                data = data.cuda()
            # data = data.permute([2, 0, 1]).unsqueeze(0).float()
            with torch.no_grad():
                output = self.model(data).cpu().numpy()
            # output = self.model(data).numpy()

            # extract outputs, resize, and remove padding
            heatmap = np.transpose(np.squeeze(output), (1, 2, 0))  # output is heatmaps
            heatmap = cv2.resize(heatmap, (0, 0), fx=stride, fy=stride, interpolation=cv2.INTER_CUBIC)
            heatmap = heatmap[:imageToTest_padded.shape[0] - pad[2], :imageToTest_padded.shape[1] - pad[3], :]
            heatmap = cv2.resize(heatmap, (oriImg.shape[1], oriImg.shape[0]), interpolation=cv2.INTER_CUBIC)

            heatmap_avg += heatmap / len(multiplier)

        all_peaks = []
        for part in range(21):
            map_ori = heatmap_avg[:, :, part]
            one_heatmap = gaussian_filter(map_ori, sigma=3)
            binary = np.ascontiguousarray(one_heatmap > thre, dtype=np.uint8)
            # all values are below the threshold
            if np.sum(binary) == 0:
                all_peaks.append([0, 0])
                continue
            label_img, label_numbers = label(binary, return_num=True, connectivity=binary.ndim)
            max_index = np.argmax([np.sum(map_ori[label_img == i]) for i in range(1, label_numbers + 1)]) + 1
            label_img[label_img != max_index] = 0
            map_ori[label_img == 0] = 0

            y, x = util.npmax(map_ori)
            all_peaks.append([x, y])
        return np.array(all_peaks)


if __name__ == "__main__":
    hand_estimation = Hand('../model/hand_pose_model.pth')

    test_image = '../images/hand.jpg'
    oriImg = cv2.imread(test_image)  # B,G,R order
    peaks = hand_estimation(oriImg)
    canvas = util.draw_handpose(oriImg, peaks, True)
    cv2.imshow('', canvas)
    cv2.waitKey(0)
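A hedged usage sketch (not part of this commit) of how Hand is typically combined with the body estimator: util.handDetect proposes square wrist-centred crops from a body-pose result and the hand network is run on each crop. The module layout and the image path below are assumptions.

import cv2
from body import Body     # assumed sibling module in this package
from hand import Hand
import util

body_estimation = Body('../model/body_pose_model.pth')
hand_estimation = Hand('../model/hand_pose_model.pth')

oriImg = cv2.imread('../images/demo.jpg')   # B,G,R order; placeholder path
candidate, subset = body_estimation(oriImg)
for x, y, w, is_left in util.handDetect(candidate, subset, oriImg):
    peaks = hand_estimation(oriImg[y:y + w, x:x + w, :])
    # shift crop-local coordinates back into full-image coordinates
    peaks[peaks[:, 0] > 0, 0] += x
    peaks[peaks[:, 1] > 0, 1] += y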
219
comfy/annotator/openpose/model.py
Normal file
@ -0,0 +1,219 @@
import torch
import torch.nn as nn
from collections import OrderedDict


def make_layers(block, no_relu_layers):
    layers = []
    for layer_name, v in block.items():
        if 'pool' in layer_name:
            layer = nn.MaxPool2d(kernel_size=v[0], stride=v[1],
                                 padding=v[2])
            layers.append((layer_name, layer))
        else:
            conv2d = nn.Conv2d(in_channels=v[0], out_channels=v[1],
                               kernel_size=v[2], stride=v[3],
                               padding=v[4])
            layers.append((layer_name, conv2d))
            if layer_name not in no_relu_layers:
                layers.append(('relu_' + layer_name, nn.ReLU(inplace=True)))

    return nn.Sequential(OrderedDict(layers))


class bodypose_model(nn.Module):
    def __init__(self):
        super(bodypose_model, self).__init__()

        # these layers have no relu layer
        no_relu_layers = ['conv5_5_CPM_L1', 'conv5_5_CPM_L2', 'Mconv7_stage2_L1',\
                          'Mconv7_stage2_L2', 'Mconv7_stage3_L1', 'Mconv7_stage3_L2',\
                          'Mconv7_stage4_L1', 'Mconv7_stage4_L2', 'Mconv7_stage5_L1',\
                          'Mconv7_stage5_L2', 'Mconv7_stage6_L1', 'Mconv7_stage6_L2']
        blocks = {}
        block0 = OrderedDict([
            ('conv1_1', [3, 64, 3, 1, 1]),
            ('conv1_2', [64, 64, 3, 1, 1]),
            ('pool1_stage1', [2, 2, 0]),
            ('conv2_1', [64, 128, 3, 1, 1]),
            ('conv2_2', [128, 128, 3, 1, 1]),
            ('pool2_stage1', [2, 2, 0]),
            ('conv3_1', [128, 256, 3, 1, 1]),
            ('conv3_2', [256, 256, 3, 1, 1]),
            ('conv3_3', [256, 256, 3, 1, 1]),
            ('conv3_4', [256, 256, 3, 1, 1]),
            ('pool3_stage1', [2, 2, 0]),
            ('conv4_1', [256, 512, 3, 1, 1]),
            ('conv4_2', [512, 512, 3, 1, 1]),
            ('conv4_3_CPM', [512, 256, 3, 1, 1]),
            ('conv4_4_CPM', [256, 128, 3, 1, 1])
        ])

        # Stage 1
        block1_1 = OrderedDict([
            ('conv5_1_CPM_L1', [128, 128, 3, 1, 1]),
            ('conv5_2_CPM_L1', [128, 128, 3, 1, 1]),
            ('conv5_3_CPM_L1', [128, 128, 3, 1, 1]),
            ('conv5_4_CPM_L1', [128, 512, 1, 1, 0]),
            ('conv5_5_CPM_L1', [512, 38, 1, 1, 0])
        ])

        block1_2 = OrderedDict([
            ('conv5_1_CPM_L2', [128, 128, 3, 1, 1]),
            ('conv5_2_CPM_L2', [128, 128, 3, 1, 1]),
            ('conv5_3_CPM_L2', [128, 128, 3, 1, 1]),
            ('conv5_4_CPM_L2', [128, 512, 1, 1, 0]),
            ('conv5_5_CPM_L2', [512, 19, 1, 1, 0])
        ])
        blocks['block1_1'] = block1_1
        blocks['block1_2'] = block1_2

        self.model0 = make_layers(block0, no_relu_layers)

        # Stages 2 - 6
        for i in range(2, 7):
            blocks['block%d_1' % i] = OrderedDict([
                ('Mconv1_stage%d_L1' % i, [185, 128, 7, 1, 3]),
                ('Mconv2_stage%d_L1' % i, [128, 128, 7, 1, 3]),
                ('Mconv3_stage%d_L1' % i, [128, 128, 7, 1, 3]),
                ('Mconv4_stage%d_L1' % i, [128, 128, 7, 1, 3]),
                ('Mconv5_stage%d_L1' % i, [128, 128, 7, 1, 3]),
                ('Mconv6_stage%d_L1' % i, [128, 128, 1, 1, 0]),
                ('Mconv7_stage%d_L1' % i, [128, 38, 1, 1, 0])
            ])

            blocks['block%d_2' % i] = OrderedDict([
                ('Mconv1_stage%d_L2' % i, [185, 128, 7, 1, 3]),
                ('Mconv2_stage%d_L2' % i, [128, 128, 7, 1, 3]),
                ('Mconv3_stage%d_L2' % i, [128, 128, 7, 1, 3]),
                ('Mconv4_stage%d_L2' % i, [128, 128, 7, 1, 3]),
                ('Mconv5_stage%d_L2' % i, [128, 128, 7, 1, 3]),
                ('Mconv6_stage%d_L2' % i, [128, 128, 1, 1, 0]),
                ('Mconv7_stage%d_L2' % i, [128, 19, 1, 1, 0])
            ])

        for k in blocks.keys():
            blocks[k] = make_layers(blocks[k], no_relu_layers)

        self.model1_1 = blocks['block1_1']
        self.model2_1 = blocks['block2_1']
        self.model3_1 = blocks['block3_1']
        self.model4_1 = blocks['block4_1']
        self.model5_1 = blocks['block5_1']
        self.model6_1 = blocks['block6_1']

        self.model1_2 = blocks['block1_2']
        self.model2_2 = blocks['block2_2']
        self.model3_2 = blocks['block3_2']
        self.model4_2 = blocks['block4_2']
        self.model5_2 = blocks['block5_2']
        self.model6_2 = blocks['block6_2']

    def forward(self, x):

        out1 = self.model0(x)

        out1_1 = self.model1_1(out1)
        out1_2 = self.model1_2(out1)
        out2 = torch.cat([out1_1, out1_2, out1], 1)

        out2_1 = self.model2_1(out2)
        out2_2 = self.model2_2(out2)
        out3 = torch.cat([out2_1, out2_2, out1], 1)

        out3_1 = self.model3_1(out3)
        out3_2 = self.model3_2(out3)
        out4 = torch.cat([out3_1, out3_2, out1], 1)

        out4_1 = self.model4_1(out4)
        out4_2 = self.model4_2(out4)
        out5 = torch.cat([out4_1, out4_2, out1], 1)

        out5_1 = self.model5_1(out5)
        out5_2 = self.model5_2(out5)
        out6 = torch.cat([out5_1, out5_2, out1], 1)

        out6_1 = self.model6_1(out6)
        out6_2 = self.model6_2(out6)

        return out6_1, out6_2


class handpose_model(nn.Module):
    def __init__(self):
        super(handpose_model, self).__init__()

        # these layers have no relu layer
        no_relu_layers = ['conv6_2_CPM', 'Mconv7_stage2', 'Mconv7_stage3',\
                          'Mconv7_stage4', 'Mconv7_stage5', 'Mconv7_stage6']
        # stage 1
        block1_0 = OrderedDict([
            ('conv1_1', [3, 64, 3, 1, 1]),
            ('conv1_2', [64, 64, 3, 1, 1]),
            ('pool1_stage1', [2, 2, 0]),
            ('conv2_1', [64, 128, 3, 1, 1]),
            ('conv2_2', [128, 128, 3, 1, 1]),
            ('pool2_stage1', [2, 2, 0]),
            ('conv3_1', [128, 256, 3, 1, 1]),
            ('conv3_2', [256, 256, 3, 1, 1]),
            ('conv3_3', [256, 256, 3, 1, 1]),
            ('conv3_4', [256, 256, 3, 1, 1]),
            ('pool3_stage1', [2, 2, 0]),
            ('conv4_1', [256, 512, 3, 1, 1]),
            ('conv4_2', [512, 512, 3, 1, 1]),
            ('conv4_3', [512, 512, 3, 1, 1]),
            ('conv4_4', [512, 512, 3, 1, 1]),
            ('conv5_1', [512, 512, 3, 1, 1]),
            ('conv5_2', [512, 512, 3, 1, 1]),
            ('conv5_3_CPM', [512, 128, 3, 1, 1])
        ])

        block1_1 = OrderedDict([
            ('conv6_1_CPM', [128, 512, 1, 1, 0]),
            ('conv6_2_CPM', [512, 22, 1, 1, 0])
        ])

        blocks = {}
        blocks['block1_0'] = block1_0
        blocks['block1_1'] = block1_1

        # stage 2-6
        for i in range(2, 7):
            blocks['block%d' % i] = OrderedDict([
                ('Mconv1_stage%d' % i, [150, 128, 7, 1, 3]),
                ('Mconv2_stage%d' % i, [128, 128, 7, 1, 3]),
                ('Mconv3_stage%d' % i, [128, 128, 7, 1, 3]),
                ('Mconv4_stage%d' % i, [128, 128, 7, 1, 3]),
                ('Mconv5_stage%d' % i, [128, 128, 7, 1, 3]),
                ('Mconv6_stage%d' % i, [128, 128, 1, 1, 0]),
                ('Mconv7_stage%d' % i, [128, 22, 1, 1, 0])
            ])

        for k in blocks.keys():
            blocks[k] = make_layers(blocks[k], no_relu_layers)

        self.model1_0 = blocks['block1_0']
        self.model1_1 = blocks['block1_1']
        self.model2 = blocks['block2']
        self.model3 = blocks['block3']
        self.model4 = blocks['block4']
        self.model5 = blocks['block5']
        self.model6 = blocks['block6']

    def forward(self, x):
        out1_0 = self.model1_0(x)
        out1_1 = self.model1_1(out1_0)
        concat_stage2 = torch.cat([out1_1, out1_0], 1)
        out_stage2 = self.model2(concat_stage2)
        concat_stage3 = torch.cat([out_stage2, out1_0], 1)
        out_stage3 = self.model3(concat_stage3)
        concat_stage4 = torch.cat([out_stage3, out1_0], 1)
        out_stage4 = self.model4(concat_stage4)
        concat_stage5 = torch.cat([out_stage4, out1_0], 1)
        out_stage5 = self.model5(concat_stage5)
        concat_stage6 = torch.cat([out_stage5, out1_0], 1)
        out_stage6 = self.model6(concat_stage6)
        return out_stage6
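Both networks are fully convolutional with three 2x2 poolings, so spatial outputs are 1/8 of the input, and the channel counts follow the final Mconv7/conv6_2_CPM layers above (38 PAF channels, 19 body heatmaps, 22 hand heatmaps). A quick sanity-check sketch, not part of this commit; the import path is assumed:

import torch
from model import bodypose_model, handpose_model  # assumed import path

body = bodypose_model()
hand = handpose_model()
x = torch.zeros(1, 3, 368, 368)
with torch.no_grad():
    pafs, heatmaps = body(x)
    hand_maps = hand(x)
print(pafs.shape, heatmaps.shape, hand_maps.shape)
# expected: (1, 38, 46, 46) (1, 19, 46, 46) (1, 22, 46, 46)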
164
comfy/annotator/openpose/util.py
Normal file
@ -0,0 +1,164 @@
import math
import numpy as np
import matplotlib
import cv2


def padRightDownCorner(img, stride, padValue):
    h = img.shape[0]
    w = img.shape[1]

    pad = 4 * [None]
    pad[0] = 0  # up
    pad[1] = 0  # left
    pad[2] = 0 if (h % stride == 0) else stride - (h % stride)  # down
    pad[3] = 0 if (w % stride == 0) else stride - (w % stride)  # right

    img_padded = img
    pad_up = np.tile(img_padded[0:1, :, :] * 0 + padValue, (pad[0], 1, 1))
    img_padded = np.concatenate((pad_up, img_padded), axis=0)
    pad_left = np.tile(img_padded[:, 0:1, :] * 0 + padValue, (1, pad[1], 1))
    img_padded = np.concatenate((pad_left, img_padded), axis=1)
    pad_down = np.tile(img_padded[-2:-1, :, :] * 0 + padValue, (pad[2], 1, 1))
    img_padded = np.concatenate((img_padded, pad_down), axis=0)
    pad_right = np.tile(img_padded[:, -2:-1, :] * 0 + padValue, (1, pad[3], 1))
    img_padded = np.concatenate((img_padded, pad_right), axis=1)

    return img_padded, pad


# transfer the caffe model weights to pytorch, matching them by layer name
def transfer(model, model_weights):
    transfered_model_weights = {}
    for weights_name in model.state_dict().keys():
        transfered_model_weights[weights_name] = model_weights['.'.join(weights_name.split('.')[1:])]
    return transfered_model_weights


# draw the body keypoints and limbs
def draw_bodypose(canvas, candidate, subset):
    stickwidth = 4
    limbSeq = [[2, 3], [2, 6], [3, 4], [4, 5], [6, 7], [7, 8], [2, 9], [9, 10], \
               [10, 11], [2, 12], [12, 13], [13, 14], [2, 1], [1, 15], [15, 17], \
               [1, 16], [16, 18], [3, 17], [6, 18]]

    colors = [[255, 0, 0], [255, 85, 0], [255, 170, 0], [255, 255, 0], [170, 255, 0], [85, 255, 0], [0, 255, 0], \
              [0, 255, 85], [0, 255, 170], [0, 255, 255], [0, 170, 255], [0, 85, 255], [0, 0, 255], [85, 0, 255], \
              [170, 0, 255], [255, 0, 255], [255, 0, 170], [255, 0, 85]]
    for i in range(18):
        for n in range(len(subset)):
            index = int(subset[n][i])
            if index == -1:
                continue
            x, y = candidate[index][0:2]
            cv2.circle(canvas, (int(x), int(y)), 4, colors[i], thickness=-1)
    for i in range(17):
        for n in range(len(subset)):
            index = subset[n][np.array(limbSeq[i]) - 1]
            if -1 in index:
                continue
            cur_canvas = canvas.copy()
            Y = candidate[index.astype(int), 0]
            X = candidate[index.astype(int), 1]
            mX = np.mean(X)
            mY = np.mean(Y)
            length = ((X[0] - X[1]) ** 2 + (Y[0] - Y[1]) ** 2) ** 0.5
            angle = math.degrees(math.atan2(X[0] - X[1], Y[0] - Y[1]))
            polygon = cv2.ellipse2Poly((int(mY), int(mX)), (int(length / 2), stickwidth), int(angle), 0, 360, 1)
            cv2.fillConvexPoly(cur_canvas, polygon, colors[i])
            canvas = cv2.addWeighted(canvas, 0.4, cur_canvas, 0.6, 0)
    # plt.imsave("preview.jpg", canvas[:, :, [2, 1, 0]])
    # plt.imshow(canvas[:, :, [2, 1, 0]])
    return canvas


# images drawn with opencv are not as nice as matplotlib's, but drawing is faster
def draw_handpose(canvas, all_hand_peaks, show_number=False):
    edges = [[0, 1], [1, 2], [2, 3], [3, 4], [0, 5], [5, 6], [6, 7], [7, 8], [0, 9], [9, 10], \
             [10, 11], [11, 12], [0, 13], [13, 14], [14, 15], [15, 16], [0, 17], [17, 18], [18, 19], [19, 20]]

    for peaks in all_hand_peaks:
        for ie, e in enumerate(edges):
            if np.sum(np.all(peaks[e], axis=1) == 0) == 0:
                x1, y1 = peaks[e[0]]
                x2, y2 = peaks[e[1]]
                cv2.line(canvas, (x1, y1), (x2, y2), matplotlib.colors.hsv_to_rgb([ie / float(len(edges)), 1.0, 1.0]) * 255, thickness=2)

        for i, keypoint in enumerate(peaks):
            x, y = keypoint
            cv2.circle(canvas, (x, y), 4, (0, 0, 255), thickness=-1)
            if show_number:
                cv2.putText(canvas, str(i), (x, y), cv2.FONT_HERSHEY_SIMPLEX, 0.3, (0, 0, 0), lineType=cv2.LINE_AA)
    return canvas


# detect hands according to the body pose keypoints
# please refer to https://github.com/CMU-Perceptual-Computing-Lab/openpose/blob/master/src/openpose/hand/handDetector.cpp
def handDetect(candidate, subset, oriImg):
    # right hand: wrist 4, elbow 3, shoulder 2
    # left hand: wrist 7, elbow 6, shoulder 5
    ratioWristElbow = 0.33
    detect_result = []
    image_height, image_width = oriImg.shape[0:2]
    for person in subset.astype(int):
        # skip a hand if any of its three keypoints is not detected
        has_left = np.sum(person[[5, 6, 7]] == -1) == 0
        has_right = np.sum(person[[2, 3, 4]] == -1) == 0
        if not (has_left or has_right):
            continue
        hands = []
        # left hand
        if has_left:
            left_shoulder_index, left_elbow_index, left_wrist_index = person[[5, 6, 7]]
            x1, y1 = candidate[left_shoulder_index][:2]
            x2, y2 = candidate[left_elbow_index][:2]
            x3, y3 = candidate[left_wrist_index][:2]
            hands.append([x1, y1, x2, y2, x3, y3, True])
        # right hand
        if has_right:
            right_shoulder_index, right_elbow_index, right_wrist_index = person[[2, 3, 4]]
            x1, y1 = candidate[right_shoulder_index][:2]
            x2, y2 = candidate[right_elbow_index][:2]
            x3, y3 = candidate[right_wrist_index][:2]
            hands.append([x1, y1, x2, y2, x3, y3, False])

        for x1, y1, x2, y2, x3, y3, is_left in hands:
            # pos_hand = pos_wrist + ratio * (pos_wrist - pos_elbox) = (1 + ratio) * pos_wrist - ratio * pos_elbox
            # handRectangle.x = posePtr[wrist*3] + ratioWristElbow * (posePtr[wrist*3] - posePtr[elbow*3]);
            # handRectangle.y = posePtr[wrist*3+1] + ratioWristElbow * (posePtr[wrist*3+1] - posePtr[elbow*3+1]);
            # const auto distanceWristElbow = getDistance(poseKeypoints, person, wrist, elbow);
            # const auto distanceElbowShoulder = getDistance(poseKeypoints, person, elbow, shoulder);
            # handRectangle.width = 1.5f * fastMax(distanceWristElbow, 0.9f * distanceElbowShoulder);
            x = x3 + ratioWristElbow * (x3 - x2)
            y = y3 + ratioWristElbow * (y3 - y2)
            distanceWristElbow = math.sqrt((x3 - x2) ** 2 + (y3 - y2) ** 2)
            distanceElbowShoulder = math.sqrt((x2 - x1) ** 2 + (y2 - y1) ** 2)
            width = 1.5 * max(distanceWristElbow, 0.9 * distanceElbowShoulder)
            # x-y refers to the center --> offset to topLeft point
            # handRectangle.x -= handRectangle.width / 2.f;
            # handRectangle.y -= handRectangle.height / 2.f;
            x -= width / 2
            y -= width / 2  # width = height
            # clamp boxes that overflow the image
            if x < 0: x = 0
            if y < 0: y = 0
            width1 = width
            width2 = width
            if x + width > image_width: width1 = image_width - x
            if y + width > image_height: width2 = image_height - y
            width = min(width1, width2)
            # discard hand boxes smaller than 20 pixels
            if width >= 20:
                detect_result.append([int(x), int(y), int(width), is_left])

    '''
    return value: [[x, y, w, True if left hand else False]].
    width = height since the network requires a square input.
    x, y is the coordinate of the top-left corner.
    '''
    return detect_result


# get max index of 2d array
def npmax(array):
    arrayindex = array.argmax(1)
    arrayvalue = array.max(1)
    i = arrayvalue.argmax()
    j = arrayindex[i]
    return i, j
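padRightDownCorner only pads the bottom and right edges, just enough to make both spatial dimensions multiples of stride, and pad records [up, left, down, right]. A small sketch, not part of this commit; the import path is assumed:

import numpy as np
from util import padRightDownCorner  # assumed import path

img = np.zeros((367, 365, 3), dtype=np.uint8)
padded, pad = padRightDownCorner(img, stride=8, padValue=128)
print(padded.shape, pad)  # (368, 368, 3) [0, 0, 1, 3]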
23
comfy/annotator/uniformer/__init__.py
Normal file
@ -0,0 +1,23 @@
import os

from annotator.uniformer.mmseg.apis import init_segmentor, inference_segmentor, show_result_pyplot
from annotator.uniformer.mmseg.core.evaluation import get_palette
from annotator.util import annotator_ckpts_path


checkpoint_file = "https://huggingface.co/lllyasviel/ControlNet/resolve/main/annotator/ckpts/upernet_global_small.pth"


class UniformerDetector:
    def __init__(self):
        modelpath = os.path.join(annotator_ckpts_path, "upernet_global_small.pth")
        if not os.path.exists(modelpath):
            from basicsr.utils.download_util import load_file_from_url
            load_file_from_url(checkpoint_file, model_dir=annotator_ckpts_path)
        config_file = os.path.join(os.path.dirname(annotator_ckpts_path), "uniformer", "exp", "upernet_global_small", "config.py")
        self.model = init_segmentor(config_file, modelpath).cuda()

    def __call__(self, img):
        result = inference_segmentor(self.model, img)
        res_img = show_result_pyplot(self.model, img, result, get_palette('ade'), opacity=1)
        return res_img
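A hedged usage sketch, not part of this commit: the detector needs a CUDA device because the segmentor is moved to .cuda() in the constructor, takes a plain uint8 image, and returns the ADE20K-colored segmentation rendering. The input path is a placeholder.

import cv2

apply_uniformer = UniformerDetector()
img = cv2.imread("input.png")       # placeholder path
seg_map = apply_uniformer(img)      # segmentation image at the input resolution
cv2.imwrite("seg.png", seg_map)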
54
comfy/annotator/uniformer/configs/_base_/datasets/ade20k.py
Normal file
@ -0,0 +1,54 @@
# dataset settings
dataset_type = 'ADE20KDataset'
data_root = 'data/ade/ADEChallengeData2016'
img_norm_cfg = dict(
    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
crop_size = (512, 512)
train_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(type='LoadAnnotations', reduce_zero_label=True),
    dict(type='Resize', img_scale=(2048, 512), ratio_range=(0.5, 2.0)),
    dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=0.75),
    dict(type='RandomFlip', prob=0.5),
    dict(type='PhotoMetricDistortion'),
    dict(type='Normalize', **img_norm_cfg),
    dict(type='Pad', size=crop_size, pad_val=0, seg_pad_val=255),
    dict(type='DefaultFormatBundle'),
    dict(type='Collect', keys=['img', 'gt_semantic_seg']),
]
test_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(
        type='MultiScaleFlipAug',
        img_scale=(2048, 512),
        # img_ratios=[0.5, 0.75, 1.0, 1.25, 1.5, 1.75],
        flip=False,
        transforms=[
            dict(type='Resize', keep_ratio=True),
            dict(type='RandomFlip'),
            dict(type='Normalize', **img_norm_cfg),
            dict(type='ImageToTensor', keys=['img']),
            dict(type='Collect', keys=['img']),
        ])
]
data = dict(
    samples_per_gpu=4,
    workers_per_gpu=4,
    train=dict(
        type=dataset_type,
        data_root=data_root,
        img_dir='images/training',
        ann_dir='annotations/training',
        pipeline=train_pipeline),
    val=dict(
        type=dataset_type,
        data_root=data_root,
        img_dir='images/validation',
        ann_dir='annotations/validation',
        pipeline=test_pipeline),
    test=dict(
        type=dataset_type,
        data_root=data_root,
        img_dir='images/validation',
        ann_dir='annotations/validation',
        pipeline=test_pipeline))
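These _base_ dataset files are ordinary Python modules that downstream uniformer configs compose via _base_ lists. A sketch, not part of this commit, of how such a file is typically loaded, assuming the usual mmcv config loader and the path added above:

from mmcv import Config  # assumed loader, as used by mmseg

cfg = Config.fromfile("comfy/annotator/uniformer/configs/_base_/datasets/ade20k.py")
print(cfg.data.samples_per_gpu)     # 4
print(cfg.train_pipeline[2].type)   # 'Resize'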
@ -0,0 +1,59 @@
# dataset settings
dataset_type = 'ChaseDB1Dataset'
data_root = 'data/CHASE_DB1'
img_norm_cfg = dict(
    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
img_scale = (960, 999)
crop_size = (128, 128)
train_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(type='LoadAnnotations'),
    dict(type='Resize', img_scale=img_scale, ratio_range=(0.5, 2.0)),
    dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=0.75),
    dict(type='RandomFlip', prob=0.5),
    dict(type='PhotoMetricDistortion'),
    dict(type='Normalize', **img_norm_cfg),
    dict(type='Pad', size=crop_size, pad_val=0, seg_pad_val=255),
    dict(type='DefaultFormatBundle'),
    dict(type='Collect', keys=['img', 'gt_semantic_seg'])
]
test_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(
        type='MultiScaleFlipAug',
        img_scale=img_scale,
        # img_ratios=[0.5, 0.75, 1.0, 1.25, 1.5, 1.75, 2.0],
        flip=False,
        transforms=[
            dict(type='Resize', keep_ratio=True),
            dict(type='RandomFlip'),
            dict(type='Normalize', **img_norm_cfg),
            dict(type='ImageToTensor', keys=['img']),
            dict(type='Collect', keys=['img'])
        ])
]

data = dict(
    samples_per_gpu=4,
    workers_per_gpu=4,
    train=dict(
        type='RepeatDataset',
        times=40000,
        dataset=dict(
            type=dataset_type,
            data_root=data_root,
            img_dir='images/training',
            ann_dir='annotations/training',
            pipeline=train_pipeline)),
    val=dict(
        type=dataset_type,
        data_root=data_root,
        img_dir='images/validation',
        ann_dir='annotations/validation',
        pipeline=test_pipeline),
    test=dict(
        type=dataset_type,
        data_root=data_root,
        img_dir='images/validation',
        ann_dir='annotations/validation',
        pipeline=test_pipeline))
@ -0,0 +1,54 @@
# dataset settings
dataset_type = 'CityscapesDataset'
data_root = 'data/cityscapes/'
img_norm_cfg = dict(
    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
crop_size = (512, 1024)
train_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(type='LoadAnnotations'),
    dict(type='Resize', img_scale=(2048, 1024), ratio_range=(0.5, 2.0)),
    dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=0.75),
    dict(type='RandomFlip', prob=0.5),
    dict(type='PhotoMetricDistortion'),
    dict(type='Normalize', **img_norm_cfg),
    dict(type='Pad', size=crop_size, pad_val=0, seg_pad_val=255),
    dict(type='DefaultFormatBundle'),
    dict(type='Collect', keys=['img', 'gt_semantic_seg']),
]
test_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(
        type='MultiScaleFlipAug',
        img_scale=(2048, 1024),
        # img_ratios=[0.5, 0.75, 1.0, 1.25, 1.5, 1.75],
        flip=False,
        transforms=[
            dict(type='Resize', keep_ratio=True),
            dict(type='RandomFlip'),
            dict(type='Normalize', **img_norm_cfg),
            dict(type='ImageToTensor', keys=['img']),
            dict(type='Collect', keys=['img']),
        ])
]
data = dict(
    samples_per_gpu=2,
    workers_per_gpu=2,
    train=dict(
        type=dataset_type,
        data_root=data_root,
        img_dir='leftImg8bit/train',
        ann_dir='gtFine/train',
        pipeline=train_pipeline),
    val=dict(
        type=dataset_type,
        data_root=data_root,
        img_dir='leftImg8bit/val',
        ann_dir='gtFine/val',
        pipeline=test_pipeline),
    test=dict(
        type=dataset_type,
        data_root=data_root,
        img_dir='leftImg8bit/val',
        ann_dir='gtFine/val',
        pipeline=test_pipeline))
@ -0,0 +1,35 @@
_base_ = './cityscapes.py'
img_norm_cfg = dict(
    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
crop_size = (769, 769)
train_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(type='LoadAnnotations'),
    dict(type='Resize', img_scale=(2049, 1025), ratio_range=(0.5, 2.0)),
    dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=0.75),
    dict(type='RandomFlip', prob=0.5),
    dict(type='PhotoMetricDistortion'),
    dict(type='Normalize', **img_norm_cfg),
    dict(type='Pad', size=crop_size, pad_val=0, seg_pad_val=255),
    dict(type='DefaultFormatBundle'),
    dict(type='Collect', keys=['img', 'gt_semantic_seg']),
]
test_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(
        type='MultiScaleFlipAug',
        img_scale=(2049, 1025),
        # img_ratios=[0.5, 0.75, 1.0, 1.25, 1.5, 1.75],
        flip=False,
        transforms=[
            dict(type='Resize', keep_ratio=True),
            dict(type='RandomFlip'),
            dict(type='Normalize', **img_norm_cfg),
            dict(type='ImageToTensor', keys=['img']),
            dict(type='Collect', keys=['img']),
        ])
]
data = dict(
    train=dict(pipeline=train_pipeline),
    val=dict(pipeline=test_pipeline),
    test=dict(pipeline=test_pipeline))
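The file above only redefines the pipelines and crop size; everything else (data_root, loaders) is inherited from './cityscapes.py' when the config is resolved. A hedged sketch of that merge behaviour, with an illustrative filename and the usual mmcv loader assumed:

from mmcv import Config  # assumed loader, as used by mmseg

cfg = Config.fromfile("cityscapes_769x769.py")  # illustrative filename
print(cfg.data_root)   # inherited from cityscapes.py: 'data/cityscapes/'
print(cfg.crop_size)   # overridden here: (769, 769)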
59
comfy/annotator/uniformer/configs/_base_/datasets/drive.py
Normal file
@ -0,0 +1,59 @@
# dataset settings
dataset_type = 'DRIVEDataset'
data_root = 'data/DRIVE'
img_norm_cfg = dict(
    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
img_scale = (584, 565)
crop_size = (64, 64)
train_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(type='LoadAnnotations'),
    dict(type='Resize', img_scale=img_scale, ratio_range=(0.5, 2.0)),
    dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=0.75),
    dict(type='RandomFlip', prob=0.5),
    dict(type='PhotoMetricDistortion'),
    dict(type='Normalize', **img_norm_cfg),
    dict(type='Pad', size=crop_size, pad_val=0, seg_pad_val=255),
    dict(type='DefaultFormatBundle'),
    dict(type='Collect', keys=['img', 'gt_semantic_seg'])
]
test_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(
        type='MultiScaleFlipAug',
        img_scale=img_scale,
        # img_ratios=[0.5, 0.75, 1.0, 1.25, 1.5, 1.75, 2.0],
        flip=False,
        transforms=[
            dict(type='Resize', keep_ratio=True),
            dict(type='RandomFlip'),
            dict(type='Normalize', **img_norm_cfg),
            dict(type='ImageToTensor', keys=['img']),
            dict(type='Collect', keys=['img'])
        ])
]

data = dict(
    samples_per_gpu=4,
    workers_per_gpu=4,
    train=dict(
        type='RepeatDataset',
        times=40000,
        dataset=dict(
            type=dataset_type,
            data_root=data_root,
            img_dir='images/training',
            ann_dir='annotations/training',
            pipeline=train_pipeline)),
    val=dict(
        type=dataset_type,
        data_root=data_root,
        img_dir='images/validation',
        ann_dir='annotations/validation',
        pipeline=test_pipeline),
    test=dict(
        type=dataset_type,
        data_root=data_root,
        img_dir='images/validation',
        ann_dir='annotations/validation',
        pipeline=test_pipeline))
59
comfy/annotator/uniformer/configs/_base_/datasets/hrf.py
Normal file
@ -0,0 +1,59 @@
# dataset settings
dataset_type = 'HRFDataset'
data_root = 'data/HRF'
img_norm_cfg = dict(
    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
img_scale = (2336, 3504)
crop_size = (256, 256)
train_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(type='LoadAnnotations'),
    dict(type='Resize', img_scale=img_scale, ratio_range=(0.5, 2.0)),
    dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=0.75),
    dict(type='RandomFlip', prob=0.5),
    dict(type='PhotoMetricDistortion'),
    dict(type='Normalize', **img_norm_cfg),
    dict(type='Pad', size=crop_size, pad_val=0, seg_pad_val=255),
    dict(type='DefaultFormatBundle'),
    dict(type='Collect', keys=['img', 'gt_semantic_seg'])
]
test_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(
        type='MultiScaleFlipAug',
        img_scale=img_scale,
        # img_ratios=[0.5, 0.75, 1.0, 1.25, 1.5, 1.75, 2.0],
        flip=False,
        transforms=[
            dict(type='Resize', keep_ratio=True),
            dict(type='RandomFlip'),
            dict(type='Normalize', **img_norm_cfg),
            dict(type='ImageToTensor', keys=['img']),
            dict(type='Collect', keys=['img'])
        ])
]

data = dict(
    samples_per_gpu=4,
    workers_per_gpu=4,
    train=dict(
        type='RepeatDataset',
        times=40000,
        dataset=dict(
            type=dataset_type,
            data_root=data_root,
            img_dir='images/training',
            ann_dir='annotations/training',
            pipeline=train_pipeline)),
    val=dict(
        type=dataset_type,
        data_root=data_root,
        img_dir='images/validation',
        ann_dir='annotations/validation',
        pipeline=test_pipeline),
    test=dict(
        type=dataset_type,
        data_root=data_root,
        img_dir='images/validation',
        ann_dir='annotations/validation',
        pipeline=test_pipeline))
@ -0,0 +1,60 @@
# dataset settings
dataset_type = 'PascalContextDataset'
data_root = 'data/VOCdevkit/VOC2010/'
img_norm_cfg = dict(
    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)

img_scale = (520, 520)
crop_size = (480, 480)

train_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(type='LoadAnnotations'),
    dict(type='Resize', img_scale=img_scale, ratio_range=(0.5, 2.0)),
    dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=0.75),
    dict(type='RandomFlip', prob=0.5),
    dict(type='PhotoMetricDistortion'),
    dict(type='Normalize', **img_norm_cfg),
    dict(type='Pad', size=crop_size, pad_val=0, seg_pad_val=255),
    dict(type='DefaultFormatBundle'),
    dict(type='Collect', keys=['img', 'gt_semantic_seg']),
]
test_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(
        type='MultiScaleFlipAug',
        img_scale=img_scale,
        # img_ratios=[0.5, 0.75, 1.0, 1.25, 1.5, 1.75],
        flip=False,
        transforms=[
            dict(type='Resize', keep_ratio=True),
            dict(type='RandomFlip'),
            dict(type='Normalize', **img_norm_cfg),
            dict(type='ImageToTensor', keys=['img']),
            dict(type='Collect', keys=['img']),
        ])
]
data = dict(
    samples_per_gpu=4,
    workers_per_gpu=4,
    train=dict(
        type=dataset_type,
        data_root=data_root,
        img_dir='JPEGImages',
        ann_dir='SegmentationClassContext',
        split='ImageSets/SegmentationContext/train.txt',
        pipeline=train_pipeline),
    val=dict(
        type=dataset_type,
        data_root=data_root,
        img_dir='JPEGImages',
        ann_dir='SegmentationClassContext',
        split='ImageSets/SegmentationContext/val.txt',
        pipeline=test_pipeline),
    test=dict(
        type=dataset_type,
        data_root=data_root,
        img_dir='JPEGImages',
        ann_dir='SegmentationClassContext',
        split='ImageSets/SegmentationContext/val.txt',
        pipeline=test_pipeline))
@ -0,0 +1,60 @@
# dataset settings
dataset_type = 'PascalContextDataset59'
data_root = 'data/VOCdevkit/VOC2010/'
img_norm_cfg = dict(
    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)

img_scale = (520, 520)
crop_size = (480, 480)

train_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(type='LoadAnnotations', reduce_zero_label=True),
    dict(type='Resize', img_scale=img_scale, ratio_range=(0.5, 2.0)),
    dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=0.75),
    dict(type='RandomFlip', prob=0.5),
    dict(type='PhotoMetricDistortion'),
    dict(type='Normalize', **img_norm_cfg),
    dict(type='Pad', size=crop_size, pad_val=0, seg_pad_val=255),
    dict(type='DefaultFormatBundle'),
    dict(type='Collect', keys=['img', 'gt_semantic_seg']),
]
test_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(
        type='MultiScaleFlipAug',
        img_scale=img_scale,
        # img_ratios=[0.5, 0.75, 1.0, 1.25, 1.5, 1.75],
        flip=False,
        transforms=[
            dict(type='Resize', keep_ratio=True),
            dict(type='RandomFlip'),
            dict(type='Normalize', **img_norm_cfg),
            dict(type='ImageToTensor', keys=['img']),
            dict(type='Collect', keys=['img']),
        ])
]
data = dict(
    samples_per_gpu=4,
    workers_per_gpu=4,
    train=dict(
        type=dataset_type,
        data_root=data_root,
        img_dir='JPEGImages',
        ann_dir='SegmentationClassContext',
        split='ImageSets/SegmentationContext/train.txt',
        pipeline=train_pipeline),
    val=dict(
        type=dataset_type,
        data_root=data_root,
        img_dir='JPEGImages',
        ann_dir='SegmentationClassContext',
        split='ImageSets/SegmentationContext/val.txt',
        pipeline=test_pipeline),
    test=dict(
        type=dataset_type,
        data_root=data_root,
        img_dir='JPEGImages',
        ann_dir='SegmentationClassContext',
        split='ImageSets/SegmentationContext/val.txt',
        pipeline=test_pipeline))
@ -0,0 +1,57 @@
# dataset settings
dataset_type = 'PascalVOCDataset'
data_root = 'data/VOCdevkit/VOC2012'
img_norm_cfg = dict(
    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
crop_size = (512, 512)
train_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(type='LoadAnnotations'),
    dict(type='Resize', img_scale=(2048, 512), ratio_range=(0.5, 2.0)),
    dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=0.75),
    dict(type='RandomFlip', prob=0.5),
    dict(type='PhotoMetricDistortion'),
    dict(type='Normalize', **img_norm_cfg),
    dict(type='Pad', size=crop_size, pad_val=0, seg_pad_val=255),
    dict(type='DefaultFormatBundle'),
    dict(type='Collect', keys=['img', 'gt_semantic_seg']),
]
test_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(
        type='MultiScaleFlipAug',
        img_scale=(2048, 512),
        # img_ratios=[0.5, 0.75, 1.0, 1.25, 1.5, 1.75],
        flip=False,
        transforms=[
            dict(type='Resize', keep_ratio=True),
            dict(type='RandomFlip'),
            dict(type='Normalize', **img_norm_cfg),
            dict(type='ImageToTensor', keys=['img']),
            dict(type='Collect', keys=['img']),
        ])
]
data = dict(
    samples_per_gpu=4,
    workers_per_gpu=4,
    train=dict(
        type=dataset_type,
        data_root=data_root,
        img_dir='JPEGImages',
        ann_dir='SegmentationClass',
        split='ImageSets/Segmentation/train.txt',
        pipeline=train_pipeline),
    val=dict(
        type=dataset_type,
        data_root=data_root,
        img_dir='JPEGImages',
        ann_dir='SegmentationClass',
        split='ImageSets/Segmentation/val.txt',
        pipeline=test_pipeline),
    test=dict(
        type=dataset_type,
        data_root=data_root,
        img_dir='JPEGImages',
        ann_dir='SegmentationClass',
        split='ImageSets/Segmentation/val.txt',
        pipeline=test_pipeline))
@ -0,0 +1,9 @@
_base_ = './pascal_voc12.py'
# dataset settings
data = dict(
    train=dict(
        ann_dir=['SegmentationClass', 'SegmentationClassAug'],
        split=[
            'ImageSets/Segmentation/train.txt',
            'ImageSets/Segmentation/aug.txt'
        ]))
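This aug variant only overrides train.ann_dir and train.split with lists, so the merged config reads training annotations from both SegmentationClass and SegmentationClassAug while everything else comes from pascal_voc12.py. A small sketch, not part of this commit, of inspecting the merged result with the assumed mmcv loader and an illustrative filename:

from mmcv import Config  # assumed loader

cfg = Config.fromfile("pascal_voc12_aug.py")  # illustrative filename
print(cfg.data.train.ann_dir)  # ['SegmentationClass', 'SegmentationClassAug']
print(cfg.data.train.img_dir)  # 'JPEGImages', inherited from pascal_voc12.py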
59
comfy/annotator/uniformer/configs/_base_/datasets/stare.py
Normal file
@ -0,0 +1,59 @@
# dataset settings
dataset_type = 'STAREDataset'
data_root = 'data/STARE'
img_norm_cfg = dict(
    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
img_scale = (605, 700)
crop_size = (128, 128)
train_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(type='LoadAnnotations'),
    dict(type='Resize', img_scale=img_scale, ratio_range=(0.5, 2.0)),
    dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=0.75),
    dict(type='RandomFlip', prob=0.5),
    dict(type='PhotoMetricDistortion'),
    dict(type='Normalize', **img_norm_cfg),
    dict(type='Pad', size=crop_size, pad_val=0, seg_pad_val=255),
    dict(type='DefaultFormatBundle'),
    dict(type='Collect', keys=['img', 'gt_semantic_seg'])
]
test_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(
        type='MultiScaleFlipAug',
        img_scale=img_scale,
        # img_ratios=[0.5, 0.75, 1.0, 1.25, 1.5, 1.75, 2.0],
        flip=False,
        transforms=[
            dict(type='Resize', keep_ratio=True),
            dict(type='RandomFlip'),
            dict(type='Normalize', **img_norm_cfg),
            dict(type='ImageToTensor', keys=['img']),
            dict(type='Collect', keys=['img'])
        ])
]

data = dict(
    samples_per_gpu=4,
    workers_per_gpu=4,
    train=dict(
        type='RepeatDataset',
        times=40000,
        dataset=dict(
            type=dataset_type,
            data_root=data_root,
            img_dir='images/training',
            ann_dir='annotations/training',
            pipeline=train_pipeline)),
    val=dict(
        type=dataset_type,
        data_root=data_root,
        img_dir='images/validation',
        ann_dir='annotations/validation',
        pipeline=test_pipeline),
    test=dict(
        type=dataset_type,
        data_root=data_root,
        img_dir='images/validation',
        ann_dir='annotations/validation',
        pipeline=test_pipeline))
14
comfy/annotator/uniformer/configs/_base_/default_runtime.py
Normal file
@ -0,0 +1,14 @@
# yapf:disable
log_config = dict(
    interval=50,
    hooks=[
        dict(type='TextLoggerHook', by_epoch=False),
        # dict(type='TensorboardLoggerHook')
    ])
# yapf:enable
dist_params = dict(backend='nccl')
log_level = 'INFO'
load_from = None
resume_from = None
workflow = [('train', 1)]
cudnn_benchmark = True
@ -0,0 +1,46 @@
# model settings
norm_cfg = dict(type='SyncBN', requires_grad=True)
model = dict(
    type='EncoderDecoder',
    pretrained='open-mmlab://resnet50_v1c',
    backbone=dict(
        type='ResNetV1c',
        depth=50,
        num_stages=4,
        out_indices=(0, 1, 2, 3),
        dilations=(1, 1, 2, 4),
        strides=(1, 2, 1, 1),
        norm_cfg=norm_cfg,
        norm_eval=False,
        style='pytorch',
        contract_dilation=True),
    decode_head=dict(
        type='ANNHead',
        in_channels=[1024, 2048],
        in_index=[2, 3],
        channels=512,
        project_channels=256,
        query_scales=(1, ),
        key_pool_scales=(1, 3, 6, 8),
        dropout_ratio=0.1,
        num_classes=19,
        norm_cfg=norm_cfg,
        align_corners=False,
        loss_decode=dict(
            type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)),
    auxiliary_head=dict(
        type='FCNHead',
        in_channels=1024,
        in_index=2,
        channels=256,
        num_convs=1,
        concat_input=False,
        dropout_ratio=0.1,
        num_classes=19,
        norm_cfg=norm_cfg,
        align_corners=False,
        loss_decode=dict(
            type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)),
    # model training and testing settings
    train_cfg=dict(),
    test_cfg=dict(mode='whole'))
@ -0,0 +1,44 @@
# model settings
norm_cfg = dict(type='SyncBN', requires_grad=True)
model = dict(
    type='EncoderDecoder',
    pretrained='open-mmlab://resnet50_v1c',
    backbone=dict(
        type='ResNetV1c',
        depth=50,
        num_stages=4,
        out_indices=(0, 1, 2, 3),
        dilations=(1, 1, 2, 4),
        strides=(1, 2, 1, 1),
        norm_cfg=norm_cfg,
        norm_eval=False,
        style='pytorch',
        contract_dilation=True),
    decode_head=dict(
        type='APCHead',
        in_channels=2048,
        in_index=3,
        channels=512,
        pool_scales=(1, 2, 3, 6),
        dropout_ratio=0.1,
        num_classes=19,
        norm_cfg=dict(type='SyncBN', requires_grad=True),
        align_corners=False,
        loss_decode=dict(
            type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)),
    auxiliary_head=dict(
        type='FCNHead',
        in_channels=1024,
        in_index=2,
        channels=256,
        num_convs=1,
        concat_input=False,
        dropout_ratio=0.1,
        num_classes=19,
        norm_cfg=norm_cfg,
        align_corners=False,
        loss_decode=dict(
            type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)),
    # model training and testing settings
    train_cfg=dict(),
    test_cfg=dict(mode='whole'))
@ -0,0 +1,44 @@
# model settings
norm_cfg = dict(type='SyncBN', requires_grad=True)
model = dict(
    type='EncoderDecoder',
    pretrained='open-mmlab://resnet50_v1c',
    backbone=dict(
        type='ResNetV1c',
        depth=50,
        num_stages=4,
        out_indices=(0, 1, 2, 3),
        dilations=(1, 1, 2, 4),
        strides=(1, 2, 1, 1),
        norm_cfg=norm_cfg,
        norm_eval=False,
        style='pytorch',
        contract_dilation=True),
    decode_head=dict(
        type='CCHead',
        in_channels=2048,
        in_index=3,
        channels=512,
        recurrence=2,
        dropout_ratio=0.1,
        num_classes=19,
        norm_cfg=norm_cfg,
        align_corners=False,
        loss_decode=dict(
            type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)),
    auxiliary_head=dict(
        type='FCNHead',
        in_channels=1024,
        in_index=2,
        channels=256,
        num_convs=1,
        concat_input=False,
        dropout_ratio=0.1,
        num_classes=19,
        norm_cfg=norm_cfg,
        align_corners=False,
        loss_decode=dict(
            type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)),
    # model training and testing settings
    train_cfg=dict(),
    test_cfg=dict(mode='whole'))
35
comfy/annotator/uniformer/configs/_base_/models/cgnet.py
Normal file
@ -0,0 +1,35 @@
# model settings
norm_cfg = dict(type='SyncBN', eps=1e-03, requires_grad=True)
model = dict(
    type='EncoderDecoder',
    backbone=dict(
        type='CGNet',
        norm_cfg=norm_cfg,
        in_channels=3,
        num_channels=(32, 64, 128),
        num_blocks=(3, 21),
        dilations=(2, 4),
        reductions=(8, 16)),
    decode_head=dict(
        type='FCNHead',
        in_channels=256,
        in_index=2,
        channels=256,
        num_convs=0,
        concat_input=False,
        dropout_ratio=0,
        num_classes=19,
        norm_cfg=norm_cfg,
        loss_decode=dict(
            type='CrossEntropyLoss',
            use_sigmoid=False,
            loss_weight=1.0,
            class_weight=[
                2.5959933, 6.7415504, 3.5354059, 9.8663225, 9.690899, 9.369352,
                10.289121, 9.953208, 4.3097677, 9.490387, 7.674431, 9.396905,
                10.347791, 6.3927646, 10.226669, 10.241062, 10.280587,
                10.396974, 10.055647
            ])),
    # model training and testing settings
    train_cfg=dict(sampler=None),
    test_cfg=dict(mode='whole'))
@ -0,0 +1,44 @@
# model settings
norm_cfg = dict(type='SyncBN', requires_grad=True)
model = dict(
    type='EncoderDecoder',
    pretrained='open-mmlab://resnet50_v1c',
    backbone=dict(
        type='ResNetV1c',
        depth=50,
        num_stages=4,
        out_indices=(0, 1, 2, 3),
        dilations=(1, 1, 2, 4),
        strides=(1, 2, 1, 1),
        norm_cfg=norm_cfg,
        norm_eval=False,
        style='pytorch',
        contract_dilation=True),
    decode_head=dict(
        type='DAHead',
        in_channels=2048,
        in_index=3,
        channels=512,
        pam_channels=64,
        dropout_ratio=0.1,
        num_classes=19,
        norm_cfg=norm_cfg,
        align_corners=False,
        loss_decode=dict(
            type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)),
    auxiliary_head=dict(
        type='FCNHead',
        in_channels=1024,
        in_index=2,
        channels=256,
        num_convs=1,
        concat_input=False,
        dropout_ratio=0.1,
        num_classes=19,
        norm_cfg=norm_cfg,
        align_corners=False,
        loss_decode=dict(
            type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)),
    # model training and testing settings
    train_cfg=dict(),
    test_cfg=dict(mode='whole'))
@ -0,0 +1,44 @@
# model settings
norm_cfg = dict(type='SyncBN', requires_grad=True)
model = dict(
    type='EncoderDecoder',
    pretrained='open-mmlab://resnet50_v1c',
    backbone=dict(
        type='ResNetV1c',
        depth=50,
        num_stages=4,
        out_indices=(0, 1, 2, 3),
        dilations=(1, 1, 2, 4),
        strides=(1, 2, 1, 1),
        norm_cfg=norm_cfg,
        norm_eval=False,
        style='pytorch',
        contract_dilation=True),
    decode_head=dict(
        type='ASPPHead',
        in_channels=2048,
        in_index=3,
        channels=512,
        dilations=(1, 12, 24, 36),
        dropout_ratio=0.1,
        num_classes=19,
        norm_cfg=norm_cfg,
        align_corners=False,
        loss_decode=dict(
            type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)),
    auxiliary_head=dict(
        type='FCNHead',
        in_channels=1024,
        in_index=2,
        channels=256,
        num_convs=1,
        concat_input=False,
        dropout_ratio=0.1,
        num_classes=19,
        norm_cfg=norm_cfg,
        align_corners=False,
        loss_decode=dict(
            type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)),
    # model training and testing settings
    train_cfg=dict(),
    test_cfg=dict(mode='whole'))
@ -0,0 +1,50 @@
# model settings
norm_cfg = dict(type='SyncBN', requires_grad=True)
model = dict(
    type='EncoderDecoder',
    pretrained=None,
    backbone=dict(
        type='UNet',
        in_channels=3,
        base_channels=64,
        num_stages=5,
        strides=(1, 1, 1, 1, 1),
        enc_num_convs=(2, 2, 2, 2, 2),
        dec_num_convs=(2, 2, 2, 2),
        downsamples=(True, True, True, True),
        enc_dilations=(1, 1, 1, 1, 1),
        dec_dilations=(1, 1, 1, 1),
        with_cp=False,
        conv_cfg=None,
        norm_cfg=norm_cfg,
        act_cfg=dict(type='ReLU'),
        upsample_cfg=dict(type='InterpConv'),
        norm_eval=False),
    decode_head=dict(
        type='ASPPHead',
        in_channels=64,
        in_index=4,
        channels=16,
        dilations=(1, 12, 24, 36),
        dropout_ratio=0.1,
        num_classes=2,
        norm_cfg=norm_cfg,
        align_corners=False,
        loss_decode=dict(
            type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)),
    auxiliary_head=dict(
        type='FCNHead',
        in_channels=128,
        in_index=3,
        channels=64,
        num_convs=1,
        concat_input=False,
        dropout_ratio=0.1,
        num_classes=2,
        norm_cfg=norm_cfg,
        align_corners=False,
        loss_decode=dict(
            type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)),
    # model training and testing settings
    train_cfg=dict(),
    test_cfg=dict(mode='slide', crop_size=256, stride=170))
@ -0,0 +1,46 @@
# model settings
norm_cfg = dict(type='SyncBN', requires_grad=True)
model = dict(
    type='EncoderDecoder',
    pretrained='open-mmlab://resnet50_v1c',
    backbone=dict(
        type='ResNetV1c',
        depth=50,
        num_stages=4,
        out_indices=(0, 1, 2, 3),
        dilations=(1, 1, 2, 4),
        strides=(1, 2, 1, 1),
        norm_cfg=norm_cfg,
        norm_eval=False,
        style='pytorch',
        contract_dilation=True),
    decode_head=dict(
        type='DepthwiseSeparableASPPHead',
        in_channels=2048,
        in_index=3,
        channels=512,
        dilations=(1, 12, 24, 36),
        c1_in_channels=256,
        c1_channels=48,
        dropout_ratio=0.1,
        num_classes=19,
        norm_cfg=norm_cfg,
        align_corners=False,
        loss_decode=dict(
            type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)),
    auxiliary_head=dict(
        type='FCNHead',
        in_channels=1024,
        in_index=2,
        channels=256,
        num_convs=1,
        concat_input=False,
        dropout_ratio=0.1,
        num_classes=19,
        norm_cfg=norm_cfg,
        align_corners=False,
        loss_decode=dict(
            type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)),
    # model training and testing settings
    train_cfg=dict(),
    test_cfg=dict(mode='whole'))
@ -0,0 +1,44 @@
|
||||
# model settings
|
||||
norm_cfg = dict(type='SyncBN', requires_grad=True)
|
||||
model = dict(
|
||||
type='EncoderDecoder',
|
||||
pretrained='open-mmlab://resnet50_v1c',
|
||||
backbone=dict(
|
||||
type='ResNetV1c',
|
||||
depth=50,
|
||||
num_stages=4,
|
||||
out_indices=(0, 1, 2, 3),
|
||||
dilations=(1, 1, 2, 4),
|
||||
strides=(1, 2, 1, 1),
|
||||
norm_cfg=norm_cfg,
|
||||
norm_eval=False,
|
||||
style='pytorch',
|
||||
contract_dilation=True),
|
||||
decode_head=dict(
|
||||
type='DMHead',
|
||||
in_channels=2048,
|
||||
in_index=3,
|
||||
channels=512,
|
||||
filter_sizes=(1, 3, 5, 7),
|
||||
dropout_ratio=0.1,
|
||||
num_classes=19,
|
||||
norm_cfg=dict(type='SyncBN', requires_grad=True),
|
||||
align_corners=False,
|
||||
loss_decode=dict(
|
||||
type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)),
|
||||
auxiliary_head=dict(
|
||||
type='FCNHead',
|
||||
in_channels=1024,
|
||||
in_index=2,
|
||||
channels=256,
|
||||
num_convs=1,
|
||||
concat_input=False,
|
||||
dropout_ratio=0.1,
|
||||
num_classes=19,
|
||||
norm_cfg=norm_cfg,
|
||||
align_corners=False,
|
||||
loss_decode=dict(
|
||||
type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)),
|
||||
# model training and testing settings
|
||||
train_cfg=dict(),
|
||||
test_cfg=dict(mode='whole'))
|
||||
@ -0,0 +1,46 @@
|
||||
# model settings
|
||||
norm_cfg = dict(type='SyncBN', requires_grad=True)
|
||||
model = dict(
|
||||
type='EncoderDecoder',
|
||||
pretrained='open-mmlab://resnet50_v1c',
|
||||
backbone=dict(
|
||||
type='ResNetV1c',
|
||||
depth=50,
|
||||
num_stages=4,
|
||||
out_indices=(0, 1, 2, 3),
|
||||
dilations=(1, 1, 2, 4),
|
||||
strides=(1, 2, 1, 1),
|
||||
norm_cfg=norm_cfg,
|
||||
norm_eval=False,
|
||||
style='pytorch',
|
||||
contract_dilation=True),
|
||||
decode_head=dict(
|
||||
type='DNLHead',
|
||||
in_channels=2048,
|
||||
in_index=3,
|
||||
channels=512,
|
||||
dropout_ratio=0.1,
|
||||
reduction=2,
|
||||
use_scale=True,
|
||||
mode='embedded_gaussian',
|
||||
num_classes=19,
|
||||
norm_cfg=norm_cfg,
|
||||
align_corners=False,
|
||||
loss_decode=dict(
|
||||
type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)),
|
||||
auxiliary_head=dict(
|
||||
type='FCNHead',
|
||||
in_channels=1024,
|
||||
in_index=2,
|
||||
channels=256,
|
||||
num_convs=1,
|
||||
concat_input=False,
|
||||
dropout_ratio=0.1,
|
||||
num_classes=19,
|
||||
norm_cfg=norm_cfg,
|
||||
align_corners=False,
|
||||
loss_decode=dict(
|
||||
type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)),
|
||||
# model training and testing settings
|
||||
train_cfg=dict(),
|
||||
test_cfg=dict(mode='whole'))
|
||||
@ -0,0 +1,47 @@
|
||||
# model settings
|
||||
norm_cfg = dict(type='SyncBN', requires_grad=True)
|
||||
model = dict(
|
||||
type='EncoderDecoder',
|
||||
pretrained='open-mmlab://resnet50_v1c',
|
||||
backbone=dict(
|
||||
type='ResNetV1c',
|
||||
depth=50,
|
||||
num_stages=4,
|
||||
out_indices=(0, 1, 2, 3),
|
||||
dilations=(1, 1, 2, 4),
|
||||
strides=(1, 2, 1, 1),
|
||||
norm_cfg=norm_cfg,
|
||||
norm_eval=False,
|
||||
style='pytorch',
|
||||
contract_dilation=True),
|
||||
decode_head=dict(
|
||||
type='EMAHead',
|
||||
in_channels=2048,
|
||||
in_index=3,
|
||||
channels=256,
|
||||
ema_channels=512,
|
||||
num_bases=64,
|
||||
num_stages=3,
|
||||
momentum=0.1,
|
||||
dropout_ratio=0.1,
|
||||
num_classes=19,
|
||||
norm_cfg=norm_cfg,
|
||||
align_corners=False,
|
||||
loss_decode=dict(
|
||||
type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)),
|
||||
auxiliary_head=dict(
|
||||
type='FCNHead',
|
||||
in_channels=1024,
|
||||
in_index=2,
|
||||
channels=256,
|
||||
num_convs=1,
|
||||
concat_input=False,
|
||||
dropout_ratio=0.1,
|
||||
num_classes=19,
|
||||
norm_cfg=norm_cfg,
|
||||
align_corners=False,
|
||||
loss_decode=dict(
|
||||
type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)),
|
||||
# model training and testing settings
|
||||
train_cfg=dict(),
|
||||
test_cfg=dict(mode='whole'))
|
||||
@ -0,0 +1,48 @@
|
||||
# model settings
|
||||
norm_cfg = dict(type='SyncBN', requires_grad=True)
|
||||
model = dict(
|
||||
type='EncoderDecoder',
|
||||
pretrained='open-mmlab://resnet50_v1c',
|
||||
backbone=dict(
|
||||
type='ResNetV1c',
|
||||
depth=50,
|
||||
num_stages=4,
|
||||
out_indices=(0, 1, 2, 3),
|
||||
dilations=(1, 1, 2, 4),
|
||||
strides=(1, 2, 1, 1),
|
||||
norm_cfg=norm_cfg,
|
||||
norm_eval=False,
|
||||
style='pytorch',
|
||||
contract_dilation=True),
|
||||
decode_head=dict(
|
||||
type='EncHead',
|
||||
in_channels=[512, 1024, 2048],
|
||||
in_index=(1, 2, 3),
|
||||
channels=512,
|
||||
num_codes=32,
|
||||
use_se_loss=True,
|
||||
add_lateral=False,
|
||||
dropout_ratio=0.1,
|
||||
num_classes=19,
|
||||
norm_cfg=norm_cfg,
|
||||
align_corners=False,
|
||||
loss_decode=dict(
|
||||
type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
|
||||
loss_se_decode=dict(
|
||||
type='CrossEntropyLoss', use_sigmoid=True, loss_weight=0.2)),
|
||||
auxiliary_head=dict(
|
||||
type='FCNHead',
|
||||
in_channels=1024,
|
||||
in_index=2,
|
||||
channels=256,
|
||||
num_convs=1,
|
||||
concat_input=False,
|
||||
dropout_ratio=0.1,
|
||||
num_classes=19,
|
||||
norm_cfg=norm_cfg,
|
||||
align_corners=False,
|
||||
loss_decode=dict(
|
||||
type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)),
|
||||
# model training and testing settings
|
||||
train_cfg=dict(),
|
||||
test_cfg=dict(mode='whole'))
|
||||
57
comfy/annotator/uniformer/configs/_base_/models/fast_scnn.py
Normal file
@ -0,0 +1,57 @@
|
||||
# model settings
|
||||
norm_cfg = dict(type='SyncBN', requires_grad=True, momentum=0.01)
|
||||
model = dict(
|
||||
type='EncoderDecoder',
|
||||
backbone=dict(
|
||||
type='FastSCNN',
|
||||
downsample_dw_channels=(32, 48),
|
||||
global_in_channels=64,
|
||||
global_block_channels=(64, 96, 128),
|
||||
global_block_strides=(2, 2, 1),
|
||||
global_out_channels=128,
|
||||
higher_in_channels=64,
|
||||
lower_in_channels=128,
|
||||
fusion_out_channels=128,
|
||||
out_indices=(0, 1, 2),
|
||||
norm_cfg=norm_cfg,
|
||||
align_corners=False),
|
||||
decode_head=dict(
|
||||
type='DepthwiseSeparableFCNHead',
|
||||
in_channels=128,
|
||||
channels=128,
|
||||
concat_input=False,
|
||||
num_classes=19,
|
||||
in_index=-1,
|
||||
norm_cfg=norm_cfg,
|
||||
align_corners=False,
|
||||
loss_decode=dict(
|
||||
type='CrossEntropyLoss', use_sigmoid=True, loss_weight=0.4)),
|
||||
auxiliary_head=[
|
||||
dict(
|
||||
type='FCNHead',
|
||||
in_channels=128,
|
||||
channels=32,
|
||||
num_convs=1,
|
||||
num_classes=19,
|
||||
in_index=-2,
|
||||
norm_cfg=norm_cfg,
|
||||
concat_input=False,
|
||||
align_corners=False,
|
||||
loss_decode=dict(
|
||||
type='CrossEntropyLoss', use_sigmoid=True, loss_weight=0.4)),
|
||||
dict(
|
||||
type='FCNHead',
|
||||
in_channels=64,
|
||||
channels=32,
|
||||
num_convs=1,
|
||||
num_classes=19,
|
||||
in_index=-3,
|
||||
norm_cfg=norm_cfg,
|
||||
concat_input=False,
|
||||
align_corners=False,
|
||||
loss_decode=dict(
|
||||
type='CrossEntropyLoss', use_sigmoid=True, loss_weight=0.4)),
|
||||
],
|
||||
# model training and testing settings
|
||||
train_cfg=dict(),
|
||||
test_cfg=dict(mode='whole'))
|
||||
52
comfy/annotator/uniformer/configs/_base_/models/fcn_hr18.py
Normal file
@ -0,0 +1,52 @@
|
||||
# model settings
|
||||
norm_cfg = dict(type='SyncBN', requires_grad=True)
|
||||
model = dict(
|
||||
type='EncoderDecoder',
|
||||
pretrained='open-mmlab://msra/hrnetv2_w18',
|
||||
backbone=dict(
|
||||
type='HRNet',
|
||||
norm_cfg=norm_cfg,
|
||||
norm_eval=False,
|
||||
extra=dict(
|
||||
stage1=dict(
|
||||
num_modules=1,
|
||||
num_branches=1,
|
||||
block='BOTTLENECK',
|
||||
num_blocks=(4, ),
|
||||
num_channels=(64, )),
|
||||
stage2=dict(
|
||||
num_modules=1,
|
||||
num_branches=2,
|
||||
block='BASIC',
|
||||
num_blocks=(4, 4),
|
||||
num_channels=(18, 36)),
|
||||
stage3=dict(
|
||||
num_modules=4,
|
||||
num_branches=3,
|
||||
block='BASIC',
|
||||
num_blocks=(4, 4, 4),
|
||||
num_channels=(18, 36, 72)),
|
||||
stage4=dict(
|
||||
num_modules=3,
|
||||
num_branches=4,
|
||||
block='BASIC',
|
||||
num_blocks=(4, 4, 4, 4),
|
||||
num_channels=(18, 36, 72, 144)))),
|
||||
decode_head=dict(
|
||||
type='FCNHead',
|
||||
in_channels=[18, 36, 72, 144],
|
||||
in_index=(0, 1, 2, 3),
|
||||
channels=sum([18, 36, 72, 144]),
|
||||
input_transform='resize_concat',
|
||||
kernel_size=1,
|
||||
num_convs=1,
|
||||
concat_input=False,
|
||||
dropout_ratio=-1,
|
||||
num_classes=19,
|
||||
norm_cfg=norm_cfg,
|
||||
align_corners=False,
|
||||
loss_decode=dict(
|
||||
type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)),
|
||||
# model training and testing settings
|
||||
train_cfg=dict(),
|
||||
test_cfg=dict(mode='whole'))
|
||||
@ -0,0 +1,45 @@
|
||||
# model settings
|
||||
norm_cfg = dict(type='SyncBN', requires_grad=True)
|
||||
model = dict(
|
||||
type='EncoderDecoder',
|
||||
pretrained='open-mmlab://resnet50_v1c',
|
||||
backbone=dict(
|
||||
type='ResNetV1c',
|
||||
depth=50,
|
||||
num_stages=4,
|
||||
out_indices=(0, 1, 2, 3),
|
||||
dilations=(1, 1, 2, 4),
|
||||
strides=(1, 2, 1, 1),
|
||||
norm_cfg=norm_cfg,
|
||||
norm_eval=False,
|
||||
style='pytorch',
|
||||
contract_dilation=True),
|
||||
decode_head=dict(
|
||||
type='FCNHead',
|
||||
in_channels=2048,
|
||||
in_index=3,
|
||||
channels=512,
|
||||
num_convs=2,
|
||||
concat_input=True,
|
||||
dropout_ratio=0.1,
|
||||
num_classes=19,
|
||||
norm_cfg=norm_cfg,
|
||||
align_corners=False,
|
||||
loss_decode=dict(
|
||||
type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)),
|
||||
auxiliary_head=dict(
|
||||
type='FCNHead',
|
||||
in_channels=1024,
|
||||
in_index=2,
|
||||
channels=256,
|
||||
num_convs=1,
|
||||
concat_input=False,
|
||||
dropout_ratio=0.1,
|
||||
num_classes=19,
|
||||
norm_cfg=norm_cfg,
|
||||
align_corners=False,
|
||||
loss_decode=dict(
|
||||
type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)),
|
||||
# model training and testing settings
|
||||
train_cfg=dict(),
|
||||
test_cfg=dict(mode='whole'))
|
||||
@ -0,0 +1,51 @@
|
||||
# model settings
|
||||
norm_cfg = dict(type='SyncBN', requires_grad=True)
|
||||
model = dict(
|
||||
type='EncoderDecoder',
|
||||
pretrained=None,
|
||||
backbone=dict(
|
||||
type='UNet',
|
||||
in_channels=3,
|
||||
base_channels=64,
|
||||
num_stages=5,
|
||||
strides=(1, 1, 1, 1, 1),
|
||||
enc_num_convs=(2, 2, 2, 2, 2),
|
||||
dec_num_convs=(2, 2, 2, 2),
|
||||
downsamples=(True, True, True, True),
|
||||
enc_dilations=(1, 1, 1, 1, 1),
|
||||
dec_dilations=(1, 1, 1, 1),
|
||||
with_cp=False,
|
||||
conv_cfg=None,
|
||||
norm_cfg=norm_cfg,
|
||||
act_cfg=dict(type='ReLU'),
|
||||
upsample_cfg=dict(type='InterpConv'),
|
||||
norm_eval=False),
|
||||
decode_head=dict(
|
||||
type='FCNHead',
|
||||
in_channels=64,
|
||||
in_index=4,
|
||||
channels=64,
|
||||
num_convs=1,
|
||||
concat_input=False,
|
||||
dropout_ratio=0.1,
|
||||
num_classes=2,
|
||||
norm_cfg=norm_cfg,
|
||||
align_corners=False,
|
||||
loss_decode=dict(
|
||||
type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)),
|
||||
auxiliary_head=dict(
|
||||
type='FCNHead',
|
||||
in_channels=128,
|
||||
in_index=3,
|
||||
channels=64,
|
||||
num_convs=1,
|
||||
concat_input=False,
|
||||
dropout_ratio=0.1,
|
||||
num_classes=2,
|
||||
norm_cfg=norm_cfg,
|
||||
align_corners=False,
|
||||
loss_decode=dict(
|
||||
type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)),
|
||||
# model training and testing settings
|
||||
train_cfg=dict(),
|
||||
test_cfg=dict(mode='slide', crop_size=256, stride=170))
|
||||
36
comfy/annotator/uniformer/configs/_base_/models/fpn_r50.py
Normal file
@ -0,0 +1,36 @@
|
||||
# model settings
|
||||
norm_cfg = dict(type='SyncBN', requires_grad=True)
|
||||
model = dict(
|
||||
type='EncoderDecoder',
|
||||
pretrained='open-mmlab://resnet50_v1c',
|
||||
backbone=dict(
|
||||
type='ResNetV1c',
|
||||
depth=50,
|
||||
num_stages=4,
|
||||
out_indices=(0, 1, 2, 3),
|
||||
dilations=(1, 1, 1, 1),
|
||||
strides=(1, 2, 2, 2),
|
||||
norm_cfg=norm_cfg,
|
||||
norm_eval=False,
|
||||
style='pytorch',
|
||||
contract_dilation=True),
|
||||
neck=dict(
|
||||
type='FPN',
|
||||
in_channels=[256, 512, 1024, 2048],
|
||||
out_channels=256,
|
||||
num_outs=4),
|
||||
decode_head=dict(
|
||||
type='FPNHead',
|
||||
in_channels=[256, 256, 256, 256],
|
||||
in_index=[0, 1, 2, 3],
|
||||
feature_strides=[4, 8, 16, 32],
|
||||
channels=128,
|
||||
dropout_ratio=0.1,
|
||||
num_classes=19,
|
||||
norm_cfg=norm_cfg,
|
||||
align_corners=False,
|
||||
loss_decode=dict(
|
||||
type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)),
|
||||
# model training and testing settings
|
||||
train_cfg=dict(),
|
||||
test_cfg=dict(mode='whole'))
|
||||
@ -0,0 +1,35 @@
|
||||
# model settings
|
||||
norm_cfg = dict(type='SyncBN', requires_grad=True)
|
||||
model = dict(
|
||||
type='EncoderDecoder',
|
||||
backbone=dict(
|
||||
type='UniFormer',
|
||||
embed_dim=[64, 128, 320, 512],
|
||||
layers=[3, 4, 8, 3],
|
||||
head_dim=64,
|
||||
mlp_ratio=4.,
|
||||
qkv_bias=True,
|
||||
drop_rate=0.,
|
||||
attn_drop_rate=0.,
|
||||
drop_path_rate=0.1),
|
||||
neck=dict(
|
||||
type='FPN',
|
||||
in_channels=[64, 128, 320, 512],
|
||||
out_channels=256,
|
||||
num_outs=4),
|
||||
decode_head=dict(
|
||||
type='FPNHead',
|
||||
in_channels=[256, 256, 256, 256],
|
||||
in_index=[0, 1, 2, 3],
|
||||
feature_strides=[4, 8, 16, 32],
|
||||
channels=128,
|
||||
dropout_ratio=0.1,
|
||||
num_classes=150,
|
||||
norm_cfg=norm_cfg,
|
||||
align_corners=False,
|
||||
loss_decode=dict(
|
||||
type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)),
|
||||
# model training and testing settings
|
||||
train_cfg=dict(),
|
||||
test_cfg=dict(mode='whole')
|
||||
)
|
||||
@ -0,0 +1,46 @@
|
||||
# model settings
|
||||
norm_cfg = dict(type='SyncBN', requires_grad=True)
|
||||
model = dict(
|
||||
type='EncoderDecoder',
|
||||
pretrained='open-mmlab://resnet50_v1c',
|
||||
backbone=dict(
|
||||
type='ResNetV1c',
|
||||
depth=50,
|
||||
num_stages=4,
|
||||
out_indices=(0, 1, 2, 3),
|
||||
dilations=(1, 1, 2, 4),
|
||||
strides=(1, 2, 1, 1),
|
||||
norm_cfg=norm_cfg,
|
||||
norm_eval=False,
|
||||
style='pytorch',
|
||||
contract_dilation=True),
|
||||
decode_head=dict(
|
||||
type='GCHead',
|
||||
in_channels=2048,
|
||||
in_index=3,
|
||||
channels=512,
|
||||
ratio=1 / 4.,
|
||||
pooling_type='att',
|
||||
fusion_types=('channel_add', ),
|
||||
dropout_ratio=0.1,
|
||||
num_classes=19,
|
||||
norm_cfg=norm_cfg,
|
||||
align_corners=False,
|
||||
loss_decode=dict(
|
||||
type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)),
|
||||
auxiliary_head=dict(
|
||||
type='FCNHead',
|
||||
in_channels=1024,
|
||||
in_index=2,
|
||||
channels=256,
|
||||
num_convs=1,
|
||||
concat_input=False,
|
||||
dropout_ratio=0.1,
|
||||
num_classes=19,
|
||||
norm_cfg=norm_cfg,
|
||||
align_corners=False,
|
||||
loss_decode=dict(
|
||||
type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)),
|
||||
# model training and testing settings
|
||||
train_cfg=dict(),
|
||||
test_cfg=dict(mode='whole'))
|
||||
@ -0,0 +1,25 @@
|
||||
# model settings
|
||||
norm_cfg = dict(type='SyncBN', eps=0.001, requires_grad=True)
|
||||
model = dict(
|
||||
type='EncoderDecoder',
|
||||
backbone=dict(
|
||||
type='MobileNetV3',
|
||||
arch='large',
|
||||
out_indices=(1, 3, 16),
|
||||
norm_cfg=norm_cfg),
|
||||
decode_head=dict(
|
||||
type='LRASPPHead',
|
||||
in_channels=(16, 24, 960),
|
||||
in_index=(0, 1, 2),
|
||||
channels=128,
|
||||
input_transform='multiple_select',
|
||||
dropout_ratio=0.1,
|
||||
num_classes=19,
|
||||
norm_cfg=norm_cfg,
|
||||
act_cfg=dict(type='ReLU'),
|
||||
align_corners=False,
|
||||
loss_decode=dict(
|
||||
type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)),
|
||||
# model training and testing settings
|
||||
train_cfg=dict(),
|
||||
test_cfg=dict(mode='whole'))
|
||||
@ -0,0 +1,46 @@
|
||||
# model settings
|
||||
norm_cfg = dict(type='SyncBN', requires_grad=True)
|
||||
model = dict(
|
||||
type='EncoderDecoder',
|
||||
pretrained='open-mmlab://resnet50_v1c',
|
||||
backbone=dict(
|
||||
type='ResNetV1c',
|
||||
depth=50,
|
||||
num_stages=4,
|
||||
out_indices=(0, 1, 2, 3),
|
||||
dilations=(1, 1, 2, 4),
|
||||
strides=(1, 2, 1, 1),
|
||||
norm_cfg=norm_cfg,
|
||||
norm_eval=False,
|
||||
style='pytorch',
|
||||
contract_dilation=True),
|
||||
decode_head=dict(
|
||||
type='NLHead',
|
||||
in_channels=2048,
|
||||
in_index=3,
|
||||
channels=512,
|
||||
dropout_ratio=0.1,
|
||||
reduction=2,
|
||||
use_scale=True,
|
||||
mode='embedded_gaussian',
|
||||
num_classes=19,
|
||||
norm_cfg=norm_cfg,
|
||||
align_corners=False,
|
||||
loss_decode=dict(
|
||||
type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)),
|
||||
auxiliary_head=dict(
|
||||
type='FCNHead',
|
||||
in_channels=1024,
|
||||
in_index=2,
|
||||
channels=256,
|
||||
num_convs=1,
|
||||
concat_input=False,
|
||||
dropout_ratio=0.1,
|
||||
num_classes=19,
|
||||
norm_cfg=norm_cfg,
|
||||
align_corners=False,
|
||||
loss_decode=dict(
|
||||
type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)),
|
||||
# model training and testing settings
|
||||
train_cfg=dict(),
|
||||
test_cfg=dict(mode='whole'))
|
||||
@ -0,0 +1,68 @@
|
||||
# model settings
|
||||
norm_cfg = dict(type='SyncBN', requires_grad=True)
|
||||
model = dict(
|
||||
type='CascadeEncoderDecoder',
|
||||
num_stages=2,
|
||||
pretrained='open-mmlab://msra/hrnetv2_w18',
|
||||
backbone=dict(
|
||||
type='HRNet',
|
||||
norm_cfg=norm_cfg,
|
||||
norm_eval=False,
|
||||
extra=dict(
|
||||
stage1=dict(
|
||||
num_modules=1,
|
||||
num_branches=1,
|
||||
block='BOTTLENECK',
|
||||
num_blocks=(4, ),
|
||||
num_channels=(64, )),
|
||||
stage2=dict(
|
||||
num_modules=1,
|
||||
num_branches=2,
|
||||
block='BASIC',
|
||||
num_blocks=(4, 4),
|
||||
num_channels=(18, 36)),
|
||||
stage3=dict(
|
||||
num_modules=4,
|
||||
num_branches=3,
|
||||
block='BASIC',
|
||||
num_blocks=(4, 4, 4),
|
||||
num_channels=(18, 36, 72)),
|
||||
stage4=dict(
|
||||
num_modules=3,
|
||||
num_branches=4,
|
||||
block='BASIC',
|
||||
num_blocks=(4, 4, 4, 4),
|
||||
num_channels=(18, 36, 72, 144)))),
|
||||
decode_head=[
|
||||
dict(
|
||||
type='FCNHead',
|
||||
in_channels=[18, 36, 72, 144],
|
||||
channels=sum([18, 36, 72, 144]),
|
||||
in_index=(0, 1, 2, 3),
|
||||
input_transform='resize_concat',
|
||||
kernel_size=1,
|
||||
num_convs=1,
|
||||
concat_input=False,
|
||||
dropout_ratio=-1,
|
||||
num_classes=19,
|
||||
norm_cfg=norm_cfg,
|
||||
align_corners=False,
|
||||
loss_decode=dict(
|
||||
type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)),
|
||||
dict(
|
||||
type='OCRHead',
|
||||
in_channels=[18, 36, 72, 144],
|
||||
in_index=(0, 1, 2, 3),
|
||||
input_transform='resize_concat',
|
||||
channels=512,
|
||||
ocr_channels=256,
|
||||
dropout_ratio=-1,
|
||||
num_classes=19,
|
||||
norm_cfg=norm_cfg,
|
||||
align_corners=False,
|
||||
loss_decode=dict(
|
||||
type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)),
|
||||
],
|
||||
# model training and testing settings
|
||||
train_cfg=dict(),
|
||||
test_cfg=dict(mode='whole'))
|
||||
@ -0,0 +1,47 @@
|
||||
# model settings
|
||||
norm_cfg = dict(type='SyncBN', requires_grad=True)
|
||||
model = dict(
|
||||
type='CascadeEncoderDecoder',
|
||||
num_stages=2,
|
||||
pretrained='open-mmlab://resnet50_v1c',
|
||||
backbone=dict(
|
||||
type='ResNetV1c',
|
||||
depth=50,
|
||||
num_stages=4,
|
||||
out_indices=(0, 1, 2, 3),
|
||||
dilations=(1, 1, 2, 4),
|
||||
strides=(1, 2, 1, 1),
|
||||
norm_cfg=norm_cfg,
|
||||
norm_eval=False,
|
||||
style='pytorch',
|
||||
contract_dilation=True),
|
||||
decode_head=[
|
||||
dict(
|
||||
type='FCNHead',
|
||||
in_channels=1024,
|
||||
in_index=2,
|
||||
channels=256,
|
||||
num_convs=1,
|
||||
concat_input=False,
|
||||
dropout_ratio=0.1,
|
||||
num_classes=19,
|
||||
norm_cfg=norm_cfg,
|
||||
align_corners=False,
|
||||
loss_decode=dict(
|
||||
type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)),
|
||||
dict(
|
||||
type='OCRHead',
|
||||
in_channels=2048,
|
||||
in_index=3,
|
||||
channels=512,
|
||||
ocr_channels=256,
|
||||
dropout_ratio=0.1,
|
||||
num_classes=19,
|
||||
norm_cfg=norm_cfg,
|
||||
align_corners=False,
|
||||
loss_decode=dict(
|
||||
type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0))
|
||||
],
|
||||
# model training and testing settings
|
||||
train_cfg=dict(),
|
||||
test_cfg=dict(mode='whole'))
|
||||
@ -0,0 +1,56 @@
|
||||
# model settings
|
||||
norm_cfg = dict(type='SyncBN', requires_grad=True)
|
||||
model = dict(
|
||||
type='CascadeEncoderDecoder',
|
||||
num_stages=2,
|
||||
pretrained='open-mmlab://resnet50_v1c',
|
||||
backbone=dict(
|
||||
type='ResNetV1c',
|
||||
depth=50,
|
||||
num_stages=4,
|
||||
out_indices=(0, 1, 2, 3),
|
||||
dilations=(1, 1, 1, 1),
|
||||
strides=(1, 2, 2, 2),
|
||||
norm_cfg=norm_cfg,
|
||||
norm_eval=False,
|
||||
style='pytorch',
|
||||
contract_dilation=True),
|
||||
neck=dict(
|
||||
type='FPN',
|
||||
in_channels=[256, 512, 1024, 2048],
|
||||
out_channels=256,
|
||||
num_outs=4),
|
||||
decode_head=[
|
||||
dict(
|
||||
type='FPNHead',
|
||||
in_channels=[256, 256, 256, 256],
|
||||
in_index=[0, 1, 2, 3],
|
||||
feature_strides=[4, 8, 16, 32],
|
||||
channels=128,
|
||||
dropout_ratio=-1,
|
||||
num_classes=19,
|
||||
norm_cfg=norm_cfg,
|
||||
align_corners=False,
|
||||
loss_decode=dict(
|
||||
type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)),
|
||||
dict(
|
||||
type='PointHead',
|
||||
in_channels=[256],
|
||||
in_index=[0],
|
||||
channels=256,
|
||||
num_fcs=3,
|
||||
coarse_pred_each_layer=True,
|
||||
dropout_ratio=-1,
|
||||
num_classes=19,
|
||||
align_corners=False,
|
||||
loss_decode=dict(
|
||||
type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0))
|
||||
],
|
||||
# model training and testing settings
|
||||
train_cfg=dict(
|
||||
num_points=2048, oversample_ratio=3, importance_sample_ratio=0.75),
|
||||
test_cfg=dict(
|
||||
mode='whole',
|
||||
subdivision_steps=2,
|
||||
subdivision_num_points=8196,
|
||||
scale_factor=2))
|
||||
@ -0,0 +1,49 @@
|
||||
# model settings
|
||||
norm_cfg = dict(type='SyncBN', requires_grad=True)
|
||||
model = dict(
|
||||
type='EncoderDecoder',
|
||||
pretrained='open-mmlab://resnet50_v1c',
|
||||
backbone=dict(
|
||||
type='ResNetV1c',
|
||||
depth=50,
|
||||
num_stages=4,
|
||||
out_indices=(0, 1, 2, 3),
|
||||
dilations=(1, 1, 2, 4),
|
||||
strides=(1, 2, 1, 1),
|
||||
norm_cfg=norm_cfg,
|
||||
norm_eval=False,
|
||||
style='pytorch',
|
||||
contract_dilation=True),
|
||||
decode_head=dict(
|
||||
type='PSAHead',
|
||||
in_channels=2048,
|
||||
in_index=3,
|
||||
channels=512,
|
||||
mask_size=(97, 97),
|
||||
psa_type='bi-direction',
|
||||
compact=False,
|
||||
shrink_factor=2,
|
||||
normalization_factor=1.0,
|
||||
psa_softmax=True,
|
||||
dropout_ratio=0.1,
|
||||
num_classes=19,
|
||||
norm_cfg=norm_cfg,
|
||||
align_corners=False,
|
||||
loss_decode=dict(
|
||||
type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)),
|
||||
auxiliary_head=dict(
|
||||
type='FCNHead',
|
||||
in_channels=1024,
|
||||
in_index=2,
|
||||
channels=256,
|
||||
num_convs=1,
|
||||
concat_input=False,
|
||||
dropout_ratio=0.1,
|
||||
num_classes=19,
|
||||
norm_cfg=norm_cfg,
|
||||
align_corners=False,
|
||||
loss_decode=dict(
|
||||
type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)),
|
||||
# model training and testing settings
|
||||
train_cfg=dict(),
|
||||
test_cfg=dict(mode='whole'))
|
||||
@ -0,0 +1,44 @@
|
||||
# model settings
|
||||
norm_cfg = dict(type='SyncBN', requires_grad=True)
|
||||
model = dict(
|
||||
type='EncoderDecoder',
|
||||
pretrained='open-mmlab://resnet50_v1c',
|
||||
backbone=dict(
|
||||
type='ResNetV1c',
|
||||
depth=50,
|
||||
num_stages=4,
|
||||
out_indices=(0, 1, 2, 3),
|
||||
dilations=(1, 1, 2, 4),
|
||||
strides=(1, 2, 1, 1),
|
||||
norm_cfg=norm_cfg,
|
||||
norm_eval=False,
|
||||
style='pytorch',
|
||||
contract_dilation=True),
|
||||
decode_head=dict(
|
||||
type='PSPHead',
|
||||
in_channels=2048,
|
||||
in_index=3,
|
||||
channels=512,
|
||||
pool_scales=(1, 2, 3, 6),
|
||||
dropout_ratio=0.1,
|
||||
num_classes=19,
|
||||
norm_cfg=norm_cfg,
|
||||
align_corners=False,
|
||||
loss_decode=dict(
|
||||
type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)),
|
||||
auxiliary_head=dict(
|
||||
type='FCNHead',
|
||||
in_channels=1024,
|
||||
in_index=2,
|
||||
channels=256,
|
||||
num_convs=1,
|
||||
concat_input=False,
|
||||
dropout_ratio=0.1,
|
||||
num_classes=19,
|
||||
norm_cfg=norm_cfg,
|
||||
align_corners=False,
|
||||
loss_decode=dict(
|
||||
type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)),
|
||||
# model training and testing settings
|
||||
train_cfg=dict(),
|
||||
test_cfg=dict(mode='whole'))
|
||||
@ -0,0 +1,50 @@
|
||||
# model settings
|
||||
norm_cfg = dict(type='SyncBN', requires_grad=True)
|
||||
model = dict(
|
||||
type='EncoderDecoder',
|
||||
pretrained=None,
|
||||
backbone=dict(
|
||||
type='UNet',
|
||||
in_channels=3,
|
||||
base_channels=64,
|
||||
num_stages=5,
|
||||
strides=(1, 1, 1, 1, 1),
|
||||
enc_num_convs=(2, 2, 2, 2, 2),
|
||||
dec_num_convs=(2, 2, 2, 2),
|
||||
downsamples=(True, True, True, True),
|
||||
enc_dilations=(1, 1, 1, 1, 1),
|
||||
dec_dilations=(1, 1, 1, 1),
|
||||
with_cp=False,
|
||||
conv_cfg=None,
|
||||
norm_cfg=norm_cfg,
|
||||
act_cfg=dict(type='ReLU'),
|
||||
upsample_cfg=dict(type='InterpConv'),
|
||||
norm_eval=False),
|
||||
decode_head=dict(
|
||||
type='PSPHead',
|
||||
in_channels=64,
|
||||
in_index=4,
|
||||
channels=16,
|
||||
pool_scales=(1, 2, 3, 6),
|
||||
dropout_ratio=0.1,
|
||||
num_classes=2,
|
||||
norm_cfg=norm_cfg,
|
||||
align_corners=False,
|
||||
loss_decode=dict(
|
||||
type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)),
|
||||
auxiliary_head=dict(
|
||||
type='FCNHead',
|
||||
in_channels=128,
|
||||
in_index=3,
|
||||
channels=64,
|
||||
num_convs=1,
|
||||
concat_input=False,
|
||||
dropout_ratio=0.1,
|
||||
num_classes=2,
|
||||
norm_cfg=norm_cfg,
|
||||
align_corners=False,
|
||||
loss_decode=dict(
|
||||
type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)),
|
||||
# model training and testing settings
|
||||
train_cfg=dict(),
|
||||
test_cfg=dict(mode='slide', crop_size=256, stride=170))
|
||||
@ -0,0 +1,44 @@
|
||||
# model settings
|
||||
norm_cfg = dict(type='SyncBN', requires_grad=True)
|
||||
model = dict(
|
||||
type='EncoderDecoder',
|
||||
pretrained='open-mmlab://resnet50_v1c',
|
||||
backbone=dict(
|
||||
type='ResNetV1c',
|
||||
depth=50,
|
||||
num_stages=4,
|
||||
out_indices=(0, 1, 2, 3),
|
||||
dilations=(1, 1, 1, 1),
|
||||
strides=(1, 2, 2, 2),
|
||||
norm_cfg=norm_cfg,
|
||||
norm_eval=False,
|
||||
style='pytorch',
|
||||
contract_dilation=True),
|
||||
decode_head=dict(
|
||||
type='UPerHead',
|
||||
in_channels=[256, 512, 1024, 2048],
|
||||
in_index=[0, 1, 2, 3],
|
||||
pool_scales=(1, 2, 3, 6),
|
||||
channels=512,
|
||||
dropout_ratio=0.1,
|
||||
num_classes=19,
|
||||
norm_cfg=norm_cfg,
|
||||
align_corners=False,
|
||||
loss_decode=dict(
|
||||
type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)),
|
||||
auxiliary_head=dict(
|
||||
type='FCNHead',
|
||||
in_channels=1024,
|
||||
in_index=2,
|
||||
channels=256,
|
||||
num_convs=1,
|
||||
concat_input=False,
|
||||
dropout_ratio=0.1,
|
||||
num_classes=19,
|
||||
norm_cfg=norm_cfg,
|
||||
align_corners=False,
|
||||
loss_decode=dict(
|
||||
type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)),
|
||||
# model training and testing settings
|
||||
train_cfg=dict(),
|
||||
test_cfg=dict(mode='whole'))
|
||||
@ -0,0 +1,43 @@
|
||||
# model settings
|
||||
norm_cfg = dict(type='BN', requires_grad=True)
|
||||
model = dict(
|
||||
type='EncoderDecoder',
|
||||
pretrained=None,
|
||||
backbone=dict(
|
||||
type='UniFormer',
|
||||
embed_dim=[64, 128, 320, 512],
|
||||
layers=[3, 4, 8, 3],
|
||||
head_dim=64,
|
||||
mlp_ratio=4.,
|
||||
qkv_bias=True,
|
||||
drop_rate=0.,
|
||||
attn_drop_rate=0.,
|
||||
drop_path_rate=0.1),
|
||||
decode_head=dict(
|
||||
type='UPerHead',
|
||||
in_channels=[64, 128, 320, 512],
|
||||
in_index=[0, 1, 2, 3],
|
||||
pool_scales=(1, 2, 3, 6),
|
||||
channels=512,
|
||||
dropout_ratio=0.1,
|
||||
num_classes=19,
|
||||
norm_cfg=norm_cfg,
|
||||
align_corners=False,
|
||||
loss_decode=dict(
|
||||
type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)),
|
||||
auxiliary_head=dict(
|
||||
type='FCNHead',
|
||||
in_channels=320,
|
||||
in_index=2,
|
||||
channels=256,
|
||||
num_convs=1,
|
||||
concat_input=False,
|
||||
dropout_ratio=0.1,
|
||||
num_classes=19,
|
||||
norm_cfg=norm_cfg,
|
||||
align_corners=False,
|
||||
loss_decode=dict(
|
||||
type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)),
|
||||
# model training and testing settings
|
||||
train_cfg=dict(),
|
||||
test_cfg=dict(mode='whole'))
|
||||
@ -0,0 +1,9 @@
# optimizer
optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0005)
optimizer_config = dict()
# learning policy
lr_config = dict(policy='poly', power=0.9, min_lr=1e-4, by_epoch=False)
# runtime settings
runner = dict(type='IterBasedRunner', max_iters=160000)
checkpoint_config = dict(by_epoch=False, interval=16000)
evaluation = dict(interval=16000, metric='mIoU')
@ -0,0 +1,9 @@
# optimizer
optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0005)
optimizer_config = dict()
# learning policy
lr_config = dict(policy='poly', power=0.9, min_lr=1e-4, by_epoch=False)
# runtime settings
runner = dict(type='IterBasedRunner', max_iters=20000)
checkpoint_config = dict(by_epoch=False, interval=2000)
evaluation = dict(interval=2000, metric='mIoU')
@ -0,0 +1,9 @@
# optimizer
optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0005)
optimizer_config = dict()
# learning policy
lr_config = dict(policy='poly', power=0.9, min_lr=1e-4, by_epoch=False)
# runtime settings
runner = dict(type='IterBasedRunner', max_iters=40000)
checkpoint_config = dict(by_epoch=False, interval=4000)
evaluation = dict(interval=4000, metric='mIoU')
@ -0,0 +1,9 @@
# optimizer
optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0005)
optimizer_config = dict()
# learning policy
lr_config = dict(policy='poly', power=0.9, min_lr=1e-4, by_epoch=False)
# runtime settings
runner = dict(type='IterBasedRunner', max_iters=80000)
checkpoint_config = dict(by_epoch=False, interval=8000)
evaluation = dict(interval=8000, metric='mIoU')
38
comfy/annotator/uniformer/exp/upernet_global_small/config.py
Normal file
@ -0,0 +1,38 @@
_base_ = [
'../../configs/_base_/models/upernet_uniformer.py',
'../../configs/_base_/datasets/ade20k.py',
'../../configs/_base_/default_runtime.py',
'../../configs/_base_/schedules/schedule_160k.py'
]
model = dict(
backbone=dict(
type='UniFormer',
embed_dim=[64, 128, 320, 512],
layers=[3, 4, 8, 3],
head_dim=64,
drop_path_rate=0.25,
windows=False,
hybrid=False
),
decode_head=dict(
in_channels=[64, 128, 320, 512],
num_classes=150
),
auxiliary_head=dict(
in_channels=320,
num_classes=150
))

# AdamW optimizer, no weight decay for position embedding & layer norm in backbone
optimizer = dict(_delete_=True, type='AdamW', lr=0.00006, betas=(0.9, 0.999), weight_decay=0.01,
paramwise_cfg=dict(custom_keys={'absolute_pos_embed': dict(decay_mult=0.),
'relative_position_bias_table': dict(decay_mult=0.),
'norm': dict(decay_mult=0.)}))

lr_config = dict(_delete_=True, policy='poly',
warmup='linear',
warmup_iters=1500,
warmup_ratio=1e-6,
power=1.0, min_lr=0.0, by_epoch=False)

data=dict(samples_per_gpu=2)
10
comfy/annotator/uniformer/exp/upernet_global_small/run.sh
Normal file
@ -0,0 +1,10 @@
#!/usr/bin/env bash

work_path=$(dirname $0)
PYTHONPATH="$(dirname $0)/../../":$PYTHONPATH \
python -m torch.distributed.launch --nproc_per_node=8 \
tools/train.py ${work_path}/config.py \
--launcher pytorch \
--options model.backbone.pretrained_path='your_model_path/uniformer_small_in1k.pth' \
--work-dir ${work_path}/ckpt \
2>&1 | tee -a ${work_path}/log.txt
10
comfy/annotator/uniformer/exp/upernet_global_small/test.sh
Normal file
@ -0,0 +1,10 @@
#!/usr/bin/env bash

work_path=$(dirname $0)
PYTHONPATH="$(dirname $0)/../../":$PYTHONPATH \
python -m torch.distributed.launch --nproc_per_node=8 \
tools/test.py ${work_path}/test_config_h32.py \
${work_path}/ckpt/latest.pth \
--launcher pytorch \
--eval mIoU \
2>&1 | tee -a ${work_path}/log.txt
@ -0,0 +1,38 @@
|
||||
_base_ = [
|
||||
'../../configs/_base_/models/upernet_uniformer.py',
|
||||
'../../configs/_base_/datasets/ade20k.py',
|
||||
'../../configs/_base_/default_runtime.py',
|
||||
'../../configs/_base_/schedules/schedule_160k.py'
|
||||
]
|
||||
model = dict(
|
||||
backbone=dict(
|
||||
type='UniFormer',
|
||||
embed_dim=[64, 128, 320, 512],
|
||||
layers=[3, 4, 8, 3],
|
||||
head_dim=64,
|
||||
drop_path_rate=0.25,
|
||||
windows=False,
|
||||
hybrid=False,
|
||||
),
|
||||
decode_head=dict(
|
||||
in_channels=[64, 128, 320, 512],
|
||||
num_classes=150
|
||||
),
|
||||
auxiliary_head=dict(
|
||||
in_channels=320,
|
||||
num_classes=150
|
||||
))
|
||||
|
||||
# AdamW optimizer, no weight decay for position embedding & layer norm in backbone
|
||||
optimizer = dict(_delete_=True, type='AdamW', lr=0.00006, betas=(0.9, 0.999), weight_decay=0.01,
|
||||
paramwise_cfg=dict(custom_keys={'absolute_pos_embed': dict(decay_mult=0.),
|
||||
'relative_position_bias_table': dict(decay_mult=0.),
|
||||
'norm': dict(decay_mult=0.)}))
|
||||
|
||||
lr_config = dict(_delete_=True, policy='poly',
|
||||
warmup='linear',
|
||||
warmup_iters=1500,
|
||||
warmup_ratio=1e-6,
|
||||
power=1.0, min_lr=0.0, by_epoch=False)
|
||||
|
||||
data=dict(samples_per_gpu=2)
|
||||
@ -0,0 +1,39 @@
|
||||
_base_ = [
|
||||
'../../configs/_base_/models/upernet_uniformer.py',
|
||||
'../../configs/_base_/datasets/ade20k.py',
|
||||
'../../configs/_base_/default_runtime.py',
|
||||
'../../configs/_base_/schedules/schedule_160k.py'
|
||||
]
|
||||
model = dict(
|
||||
backbone=dict(
|
||||
type='UniFormer',
|
||||
embed_dim=[64, 128, 320, 512],
|
||||
layers=[3, 4, 8, 3],
|
||||
head_dim=64,
|
||||
drop_path_rate=0.25,
|
||||
windows=False,
|
||||
hybrid=True,
|
||||
window_size=32
|
||||
),
|
||||
decode_head=dict(
|
||||
in_channels=[64, 128, 320, 512],
|
||||
num_classes=150
|
||||
),
|
||||
auxiliary_head=dict(
|
||||
in_channels=320,
|
||||
num_classes=150
|
||||
))
|
||||
|
||||
# AdamW optimizer, no weight decay for position embedding & layer norm in backbone
|
||||
optimizer = dict(_delete_=True, type='AdamW', lr=0.00006, betas=(0.9, 0.999), weight_decay=0.01,
|
||||
paramwise_cfg=dict(custom_keys={'absolute_pos_embed': dict(decay_mult=0.),
|
||||
'relative_position_bias_table': dict(decay_mult=0.),
|
||||
'norm': dict(decay_mult=0.)}))
|
||||
|
||||
lr_config = dict(_delete_=True, policy='poly',
|
||||
warmup='linear',
|
||||
warmup_iters=1500,
|
||||
warmup_ratio=1e-6,
|
||||
power=1.0, min_lr=0.0, by_epoch=False)
|
||||
|
||||
data=dict(samples_per_gpu=2)
|
||||
@ -0,0 +1,39 @@
|
||||
_base_ = [
|
||||
'../../configs/_base_/models/upernet_uniformer.py',
|
||||
'../../configs/_base_/datasets/ade20k.py',
|
||||
'../../configs/_base_/default_runtime.py',
|
||||
'../../configs/_base_/schedules/schedule_160k.py'
|
||||
]
|
||||
model = dict(
|
||||
backbone=dict(
|
||||
type='UniFormer',
|
||||
embed_dim=[64, 128, 320, 512],
|
||||
layers=[3, 4, 8, 3],
|
||||
head_dim=64,
|
||||
drop_path_rate=0.25,
|
||||
windows=True,
|
||||
hybrid=False,
|
||||
window_size=32
|
||||
),
|
||||
decode_head=dict(
|
||||
in_channels=[64, 128, 320, 512],
|
||||
num_classes=150
|
||||
),
|
||||
auxiliary_head=dict(
|
||||
in_channels=320,
|
||||
num_classes=150
|
||||
))
|
||||
|
||||
# AdamW optimizer, no weight decay for position embedding & layer norm in backbone
|
||||
optimizer = dict(_delete_=True, type='AdamW', lr=0.00006, betas=(0.9, 0.999), weight_decay=0.01,
|
||||
paramwise_cfg=dict(custom_keys={'absolute_pos_embed': dict(decay_mult=0.),
|
||||
'relative_position_bias_table': dict(decay_mult=0.),
|
||||
'norm': dict(decay_mult=0.)}))
|
||||
|
||||
lr_config = dict(_delete_=True, policy='poly',
|
||||
warmup='linear',
|
||||
warmup_iters=1500,
|
||||
warmup_ratio=1e-6,
|
||||
power=1.0, min_lr=0.0, by_epoch=False)
|
||||
|
||||
data=dict(samples_per_gpu=2)
|
||||
15
comfy/annotator/uniformer/mmcv/__init__.py
Normal file
@ -0,0 +1,15 @@
# Copyright (c) OpenMMLab. All rights reserved.
# flake8: noqa
from .arraymisc import *
from .fileio import *
from .image import *
from .utils import *
from .version import *
from .video import *
from .visualization import *

# The following modules are not imported to this level, so mmcv may be used
# without PyTorch.
# - runner
# - parallel
# - op
4
comfy/annotator/uniformer/mmcv/arraymisc/__init__.py
Normal file
@ -0,0 +1,4 @@
# Copyright (c) OpenMMLab. All rights reserved.
from .quantization import dequantize, quantize

__all__ = ['quantize', 'dequantize']
55
comfy/annotator/uniformer/mmcv/arraymisc/quantization.py
Normal file
@ -0,0 +1,55 @@
# Copyright (c) OpenMMLab. All rights reserved.
import numpy as np


def quantize(arr, min_val, max_val, levels, dtype=np.int64):
"""Quantize an array of (-inf, inf) to [0, levels-1].

Args:
arr (ndarray): Input array.
min_val (scalar): Minimum value to be clipped.
max_val (scalar): Maximum value to be clipped.
levels (int): Quantization levels.
dtype (np.type): The type of the quantized array.

Returns:
tuple: Quantized array.
"""
if not (isinstance(levels, int) and levels > 1):
raise ValueError(
f'levels must be a positive integer, but got {levels}')
if min_val >= max_val:
raise ValueError(
f'min_val ({min_val}) must be smaller than max_val ({max_val})')

arr = np.clip(arr, min_val, max_val) - min_val
quantized_arr = np.minimum(
np.floor(levels * arr / (max_val - min_val)).astype(dtype), levels - 1)

return quantized_arr


def dequantize(arr, min_val, max_val, levels, dtype=np.float64):
"""Dequantize an array.

Args:
arr (ndarray): Input array.
min_val (scalar): Minimum value to be clipped.
max_val (scalar): Maximum value to be clipped.
levels (int): Quantization levels.
dtype (np.type): The type of the dequantized array.

Returns:
tuple: Dequantized array.
"""
if not (isinstance(levels, int) and levels > 1):
raise ValueError(
f'levels must be a positive integer, but got {levels}')
if min_val >= max_val:
raise ValueError(
f'min_val ({min_val}) must be smaller than max_val ({max_val})')

dequantized_arr = (arr + 0.5).astype(dtype) * (max_val -
min_val) / levels + min_val

return dequantized_arr
41
comfy/annotator/uniformer/mmcv/cnn/__init__.py
Normal file
@ -0,0 +1,41 @@
|
||||
# Copyright (c) OpenMMLab. All rights reserved.
|
||||
from .alexnet import AlexNet
|
||||
# yapf: disable
|
||||
from .bricks import (ACTIVATION_LAYERS, CONV_LAYERS, NORM_LAYERS,
|
||||
PADDING_LAYERS, PLUGIN_LAYERS, UPSAMPLE_LAYERS,
|
||||
ContextBlock, Conv2d, Conv3d, ConvAWS2d, ConvModule,
|
||||
ConvTranspose2d, ConvTranspose3d, ConvWS2d,
|
||||
DepthwiseSeparableConvModule, GeneralizedAttention,
|
||||
HSigmoid, HSwish, Linear, MaxPool2d, MaxPool3d,
|
||||
NonLocal1d, NonLocal2d, NonLocal3d, Scale, Swish,
|
||||
build_activation_layer, build_conv_layer,
|
||||
build_norm_layer, build_padding_layer, build_plugin_layer,
|
||||
build_upsample_layer, conv_ws_2d, is_norm)
|
||||
from .builder import MODELS, build_model_from_cfg
|
||||
# yapf: enable
|
||||
from .resnet import ResNet, make_res_layer
|
||||
from .utils import (INITIALIZERS, Caffe2XavierInit, ConstantInit, KaimingInit,
|
||||
NormalInit, PretrainedInit, TruncNormalInit, UniformInit,
|
||||
XavierInit, bias_init_with_prob, caffe2_xavier_init,
|
||||
constant_init, fuse_conv_bn, get_model_complexity_info,
|
||||
initialize, kaiming_init, normal_init, trunc_normal_init,
|
||||
uniform_init, xavier_init)
|
||||
from .vgg import VGG, make_vgg_layer
|
||||
|
||||
__all__ = [
|
||||
'AlexNet', 'VGG', 'make_vgg_layer', 'ResNet', 'make_res_layer',
|
||||
'constant_init', 'xavier_init', 'normal_init', 'trunc_normal_init',
|
||||
'uniform_init', 'kaiming_init', 'caffe2_xavier_init',
|
||||
'bias_init_with_prob', 'ConvModule', 'build_activation_layer',
|
||||
'build_conv_layer', 'build_norm_layer', 'build_padding_layer',
|
||||
'build_upsample_layer', 'build_plugin_layer', 'is_norm', 'NonLocal1d',
|
||||
'NonLocal2d', 'NonLocal3d', 'ContextBlock', 'HSigmoid', 'Swish', 'HSwish',
|
||||
'GeneralizedAttention', 'ACTIVATION_LAYERS', 'CONV_LAYERS', 'NORM_LAYERS',
|
||||
'PADDING_LAYERS', 'UPSAMPLE_LAYERS', 'PLUGIN_LAYERS', 'Scale',
|
||||
'get_model_complexity_info', 'conv_ws_2d', 'ConvAWS2d', 'ConvWS2d',
|
||||
'fuse_conv_bn', 'DepthwiseSeparableConvModule', 'Linear', 'Conv2d',
|
||||
'ConvTranspose2d', 'MaxPool2d', 'ConvTranspose3d', 'MaxPool3d', 'Conv3d',
|
||||
'initialize', 'INITIALIZERS', 'ConstantInit', 'XavierInit', 'NormalInit',
|
||||
'TruncNormalInit', 'UniformInit', 'KaimingInit', 'PretrainedInit',
|
||||
'Caffe2XavierInit', 'MODELS', 'build_model_from_cfg'
|
||||
]
|
||||
61
comfy/annotator/uniformer/mmcv/cnn/alexnet.py
Normal file
@ -0,0 +1,61 @@
|
||||
# Copyright (c) OpenMMLab. All rights reserved.
|
||||
import logging
|
||||
|
||||
import torch.nn as nn
|
||||
|
||||
|
||||
class AlexNet(nn.Module):
|
||||
"""AlexNet backbone.
|
||||
|
||||
Args:
|
||||
num_classes (int): number of classes for classification.
|
||||
"""
|
||||
|
||||
def __init__(self, num_classes=-1):
|
||||
super(AlexNet, self).__init__()
|
||||
self.num_classes = num_classes
|
||||
self.features = nn.Sequential(
|
||||
nn.Conv2d(3, 64, kernel_size=11, stride=4, padding=2),
|
||||
nn.ReLU(inplace=True),
|
||||
nn.MaxPool2d(kernel_size=3, stride=2),
|
||||
nn.Conv2d(64, 192, kernel_size=5, padding=2),
|
||||
nn.ReLU(inplace=True),
|
||||
nn.MaxPool2d(kernel_size=3, stride=2),
|
||||
nn.Conv2d(192, 384, kernel_size=3, padding=1),
|
||||
nn.ReLU(inplace=True),
|
||||
nn.Conv2d(384, 256, kernel_size=3, padding=1),
|
||||
nn.ReLU(inplace=True),
|
||||
nn.Conv2d(256, 256, kernel_size=3, padding=1),
|
||||
nn.ReLU(inplace=True),
|
||||
nn.MaxPool2d(kernel_size=3, stride=2),
|
||||
)
|
||||
if self.num_classes > 0:
|
||||
self.classifier = nn.Sequential(
|
||||
nn.Dropout(),
|
||||
nn.Linear(256 * 6 * 6, 4096),
|
||||
nn.ReLU(inplace=True),
|
||||
nn.Dropout(),
|
||||
nn.Linear(4096, 4096),
|
||||
nn.ReLU(inplace=True),
|
||||
nn.Linear(4096, num_classes),
|
||||
)
|
||||
|
||||
def init_weights(self, pretrained=None):
|
||||
if isinstance(pretrained, str):
|
||||
logger = logging.getLogger()
|
||||
from ..runner import load_checkpoint
|
||||
load_checkpoint(self, pretrained, strict=False, logger=logger)
|
||||
elif pretrained is None:
|
||||
# use default initializer
|
||||
pass
|
||||
else:
|
||||
raise TypeError('pretrained must be a str or None')
|
||||
|
||||
def forward(self, x):
|
||||
|
||||
x = self.features(x)
|
||||
if self.num_classes > 0:
|
||||
x = x.view(x.size(0), 256 * 6 * 6)
|
||||
x = self.classifier(x)
|
||||
|
||||
return x
|
||||
35
comfy/annotator/uniformer/mmcv/cnn/bricks/__init__.py
Normal file
@ -0,0 +1,35 @@
|
||||
# Copyright (c) OpenMMLab. All rights reserved.
|
||||
from .activation import build_activation_layer
|
||||
from .context_block import ContextBlock
|
||||
from .conv import build_conv_layer
|
||||
from .conv2d_adaptive_padding import Conv2dAdaptivePadding
|
||||
from .conv_module import ConvModule
|
||||
from .conv_ws import ConvAWS2d, ConvWS2d, conv_ws_2d
|
||||
from .depthwise_separable_conv_module import DepthwiseSeparableConvModule
|
||||
from .drop import Dropout, DropPath
|
||||
from .generalized_attention import GeneralizedAttention
|
||||
from .hsigmoid import HSigmoid
|
||||
from .hswish import HSwish
|
||||
from .non_local import NonLocal1d, NonLocal2d, NonLocal3d
|
||||
from .norm import build_norm_layer, is_norm
|
||||
from .padding import build_padding_layer
|
||||
from .plugin import build_plugin_layer
|
||||
from .registry import (ACTIVATION_LAYERS, CONV_LAYERS, NORM_LAYERS,
|
||||
PADDING_LAYERS, PLUGIN_LAYERS, UPSAMPLE_LAYERS)
|
||||
from .scale import Scale
|
||||
from .swish import Swish
|
||||
from .upsample import build_upsample_layer
|
||||
from .wrappers import (Conv2d, Conv3d, ConvTranspose2d, ConvTranspose3d,
|
||||
Linear, MaxPool2d, MaxPool3d)
|
||||
|
||||
__all__ = [
|
||||
'ConvModule', 'build_activation_layer', 'build_conv_layer',
|
||||
'build_norm_layer', 'build_padding_layer', 'build_upsample_layer',
|
||||
'build_plugin_layer', 'is_norm', 'HSigmoid', 'HSwish', 'NonLocal1d',
|
||||
'NonLocal2d', 'NonLocal3d', 'ContextBlock', 'GeneralizedAttention',
|
||||
'ACTIVATION_LAYERS', 'CONV_LAYERS', 'NORM_LAYERS', 'PADDING_LAYERS',
|
||||
'UPSAMPLE_LAYERS', 'PLUGIN_LAYERS', 'Scale', 'ConvAWS2d', 'ConvWS2d',
|
||||
'conv_ws_2d', 'DepthwiseSeparableConvModule', 'Swish', 'Linear',
|
||||
'Conv2dAdaptivePadding', 'Conv2d', 'ConvTranspose2d', 'MaxPool2d',
|
||||
'ConvTranspose3d', 'MaxPool3d', 'Conv3d', 'Dropout', 'DropPath'
|
||||
]
|
||||
92
comfy/annotator/uniformer/mmcv/cnn/bricks/activation.py
Normal file
@ -0,0 +1,92 @@
|
||||
# Copyright (c) OpenMMLab. All rights reserved.
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
import torch.nn.functional as F
|
||||
|
||||
from annotator.uniformer.mmcv.utils import TORCH_VERSION, build_from_cfg, digit_version
|
||||
from .registry import ACTIVATION_LAYERS
|
||||
|
||||
for module in [
|
||||
nn.ReLU, nn.LeakyReLU, nn.PReLU, nn.RReLU, nn.ReLU6, nn.ELU,
|
||||
nn.Sigmoid, nn.Tanh
|
||||
]:
|
||||
ACTIVATION_LAYERS.register_module(module=module)
|
||||
|
||||
|
||||
@ACTIVATION_LAYERS.register_module(name='Clip')
|
||||
@ACTIVATION_LAYERS.register_module()
|
||||
class Clamp(nn.Module):
|
||||
"""Clamp activation layer.
|
||||
|
||||
This activation function is to clamp the feature map value within
|
||||
:math:`[min, max]`. More details can be found in ``torch.clamp()``.
|
||||
|
||||
Args:
|
||||
min (Number | optional): Lower-bound of the range to be clamped to.
|
||||
Default to -1.
|
||||
max (Number | optional): Upper-bound of the range to be clamped to.
|
||||
Default to 1.
|
||||
"""
|
||||
|
||||
def __init__(self, min=-1., max=1.):
|
||||
super(Clamp, self).__init__()
|
||||
self.min = min
|
||||
self.max = max
|
||||
|
||||
def forward(self, x):
|
||||
"""Forward function.
|
||||
|
||||
Args:
|
||||
x (torch.Tensor): The input tensor.
|
||||
|
||||
Returns:
|
||||
torch.Tensor: Clamped tensor.
|
||||
"""
|
||||
return torch.clamp(x, min=self.min, max=self.max)
|
||||
|
||||
|
||||
class GELU(nn.Module):
|
||||
r"""Applies the Gaussian Error Linear Units function:
|
||||
|
||||
.. math::
|
||||
\text{GELU}(x) = x * \Phi(x)
|
||||
where :math:`\Phi(x)` is the Cumulative Distribution Function for
|
||||
Gaussian Distribution.
|
||||
|
||||
Shape:
|
||||
- Input: :math:`(N, *)` where `*` means, any number of additional
|
||||
dimensions
|
||||
- Output: :math:`(N, *)`, same shape as the input
|
||||
|
||||
.. image:: scripts/activation_images/GELU.png
|
||||
|
||||
Examples::
|
||||
|
||||
>>> m = nn.GELU()
|
||||
>>> input = torch.randn(2)
|
||||
>>> output = m(input)
|
||||
"""
|
||||
|
||||
def forward(self, input):
|
||||
return F.gelu(input)
|
||||
|
||||
|
||||
if (TORCH_VERSION == 'parrots'
|
||||
or digit_version(TORCH_VERSION) < digit_version('1.4')):
|
||||
ACTIVATION_LAYERS.register_module(module=GELU)
|
||||
else:
|
||||
ACTIVATION_LAYERS.register_module(module=nn.GELU)
|
||||
|
||||
|
||||
def build_activation_layer(cfg):
|
||||
"""Build activation layer.
|
||||
|
||||
Args:
|
||||
cfg (dict): The activation layer config, which should contain:
|
||||
- type (str): Layer type.
|
||||
- layer args: Args needed to instantiate an activation layer.
|
||||
|
||||
Returns:
|
||||
nn.Module: Created activation layer.
|
||||
"""
|
||||
return build_from_cfg(cfg, ACTIVATION_LAYERS)
|
||||
125
comfy/annotator/uniformer/mmcv/cnn/bricks/context_block.py
Normal file
@ -0,0 +1,125 @@
|
||||
# Copyright (c) OpenMMLab. All rights reserved.
|
||||
import torch
|
||||
from torch import nn
|
||||
|
||||
from ..utils import constant_init, kaiming_init
|
||||
from .registry import PLUGIN_LAYERS
|
||||
|
||||
|
||||
def last_zero_init(m):
|
||||
if isinstance(m, nn.Sequential):
|
||||
constant_init(m[-1], val=0)
|
||||
else:
|
||||
constant_init(m, val=0)
|
||||
|
||||
|
||||
@PLUGIN_LAYERS.register_module()
|
||||
class ContextBlock(nn.Module):
|
||||
"""ContextBlock module in GCNet.
|
||||
|
||||
See 'GCNet: Non-local Networks Meet Squeeze-Excitation Networks and Beyond'
|
||||
(https://arxiv.org/abs/1904.11492) for details.
|
||||
|
||||
Args:
|
||||
in_channels (int): Channels of the input feature map.
|
||||
ratio (float): Ratio of channels of transform bottleneck
|
||||
pooling_type (str): Pooling method for context modeling.
|
||||
Options are 'att' and 'avg', stand for attention pooling and
|
||||
average pooling respectively. Default: 'att'.
|
||||
fusion_types (Sequence[str]): Fusion method for feature fusion,
|
||||
Options are 'channels_add', 'channel_mul', stand for channelwise
|
||||
addition and multiplication respectively. Default: ('channel_add',)
|
||||
"""
|
||||
|
||||
_abbr_ = 'context_block'
|
||||
|
||||
def __init__(self,
|
||||
in_channels,
|
||||
ratio,
|
||||
pooling_type='att',
|
||||
fusion_types=('channel_add', )):
|
||||
super(ContextBlock, self).__init__()
|
||||
assert pooling_type in ['avg', 'att']
|
||||
assert isinstance(fusion_types, (list, tuple))
|
||||
valid_fusion_types = ['channel_add', 'channel_mul']
|
||||
assert all([f in valid_fusion_types for f in fusion_types])
|
||||
assert len(fusion_types) > 0, 'at least one fusion should be used'
|
||||
self.in_channels = in_channels
|
||||
self.ratio = ratio
|
||||
self.planes = int(in_channels * ratio)
|
||||
self.pooling_type = pooling_type
|
||||
self.fusion_types = fusion_types
|
||||
if pooling_type == 'att':
|
||||
self.conv_mask = nn.Conv2d(in_channels, 1, kernel_size=1)
|
||||
self.softmax = nn.Softmax(dim=2)
|
||||
else:
|
||||
self.avg_pool = nn.AdaptiveAvgPool2d(1)
|
||||
if 'channel_add' in fusion_types:
|
||||
self.channel_add_conv = nn.Sequential(
|
||||
nn.Conv2d(self.in_channels, self.planes, kernel_size=1),
|
||||
nn.LayerNorm([self.planes, 1, 1]),
|
||||
nn.ReLU(inplace=True), # yapf: disable
|
||||
nn.Conv2d(self.planes, self.in_channels, kernel_size=1))
|
||||
else:
|
||||
self.channel_add_conv = None
|
||||
if 'channel_mul' in fusion_types:
|
||||
self.channel_mul_conv = nn.Sequential(
|
||||
nn.Conv2d(self.in_channels, self.planes, kernel_size=1),
|
||||
nn.LayerNorm([self.planes, 1, 1]),
|
||||
nn.ReLU(inplace=True), # yapf: disable
|
||||
nn.Conv2d(self.planes, self.in_channels, kernel_size=1))
|
||||
else:
|
||||
self.channel_mul_conv = None
|
||||
self.reset_parameters()
|
||||
|
||||
def reset_parameters(self):
|
||||
if self.pooling_type == 'att':
|
||||
kaiming_init(self.conv_mask, mode='fan_in')
|
||||
self.conv_mask.inited = True
|
||||
|
||||
if self.channel_add_conv is not None:
|
||||
last_zero_init(self.channel_add_conv)
|
||||
if self.channel_mul_conv is not None:
|
||||
last_zero_init(self.channel_mul_conv)
|
||||
|
||||
def spatial_pool(self, x):
|
||||
batch, channel, height, width = x.size()
|
||||
if self.pooling_type == 'att':
|
||||
input_x = x
|
||||
# [N, C, H * W]
|
||||
input_x = input_x.view(batch, channel, height * width)
|
||||
# [N, 1, C, H * W]
|
||||
input_x = input_x.unsqueeze(1)
|
||||
# [N, 1, H, W]
|
||||
context_mask = self.conv_mask(x)
|
||||
# [N, 1, H * W]
|
||||
context_mask = context_mask.view(batch, 1, height * width)
|
||||
# [N, 1, H * W]
|
||||
context_mask = self.softmax(context_mask)
|
||||
# [N, 1, H * W, 1]
|
||||
context_mask = context_mask.unsqueeze(-1)
|
||||
# [N, 1, C, 1]
|
||||
context = torch.matmul(input_x, context_mask)
|
||||
# [N, C, 1, 1]
|
||||
context = context.view(batch, channel, 1, 1)
|
||||
else:
|
||||
# [N, C, 1, 1]
|
||||
context = self.avg_pool(x)
|
||||
|
||||
return context
|
||||
|
||||
def forward(self, x):
|
||||
# [N, C, 1, 1]
|
||||
context = self.spatial_pool(x)
|
||||
|
||||
out = x
|
||||
if self.channel_mul_conv is not None:
|
||||
# [N, C, 1, 1]
|
||||
channel_mul_term = torch.sigmoid(self.channel_mul_conv(context))
|
||||
out = out * channel_mul_term
|
||||
if self.channel_add_conv is not None:
|
||||
# [N, C, 1, 1]
|
||||
channel_add_term = self.channel_add_conv(context)
|
||||
out = out + channel_add_term
|
||||
|
||||
return out
|
||||
44
comfy/annotator/uniformer/mmcv/cnn/bricks/conv.py
Normal file
@ -0,0 +1,44 @@
|
||||
# Copyright (c) OpenMMLab. All rights reserved.
|
||||
from torch import nn
|
||||
|
||||
from .registry import CONV_LAYERS
|
||||
|
||||
CONV_LAYERS.register_module('Conv1d', module=nn.Conv1d)
|
||||
CONV_LAYERS.register_module('Conv2d', module=nn.Conv2d)
|
||||
CONV_LAYERS.register_module('Conv3d', module=nn.Conv3d)
|
||||
CONV_LAYERS.register_module('Conv', module=nn.Conv2d)
|
||||
|
||||
|
||||
def build_conv_layer(cfg, *args, **kwargs):
|
||||
"""Build convolution layer.
|
||||
|
||||
Args:
|
||||
cfg (None or dict): The conv layer config, which should contain:
|
||||
- type (str): Layer type.
|
||||
- layer args: Args needed to instantiate an conv layer.
|
||||
args (argument list): Arguments passed to the `__init__`
|
||||
method of the corresponding conv layer.
|
||||
kwargs (keyword arguments): Keyword arguments passed to the `__init__`
|
||||
method of the corresponding conv layer.
|
||||
|
||||
Returns:
|
||||
nn.Module: Created conv layer.
|
||||
"""
|
||||
if cfg is None:
|
||||
cfg_ = dict(type='Conv2d')
|
||||
else:
|
||||
if not isinstance(cfg, dict):
|
||||
raise TypeError('cfg must be a dict')
|
||||
if 'type' not in cfg:
|
||||
raise KeyError('the cfg dict must contain the key "type"')
|
||||
cfg_ = cfg.copy()
|
||||
|
||||
layer_type = cfg_.pop('type')
|
||||
if layer_type not in CONV_LAYERS:
|
||||
raise KeyError(f'Unrecognized norm type {layer_type}')
|
||||
else:
|
||||
conv_layer = CONV_LAYERS.get(layer_type)
|
||||
|
||||
layer = conv_layer(*args, **kwargs, **cfg_)
|
||||
|
||||
return layer
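
# Usage sketch (added for clarity, not part of the upstream file); assumes the
# vendored package is importable as `annotator.uniformer.mmcv`, matching the
# file's own imports.
import torch
from annotator.uniformer.mmcv.cnn.bricks.conv import build_conv_layer

# Args/kwargs after the cfg are forwarded to the chosen layer's __init__.
conv = build_conv_layer(dict(type='Conv2d'), 3, 16, kernel_size=3, padding=1)
print(type(conv).__name__, conv(torch.randn(1, 3, 32, 32)).shape)
# Conv2d torch.Size([1, 16, 32, 32])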
|
||||
@ -0,0 +1,62 @@
|
||||
# Copyright (c) OpenMMLab. All rights reserved.
|
||||
import math
|
||||
|
||||
from torch import nn
|
||||
from torch.nn import functional as F
|
||||
|
||||
from .registry import CONV_LAYERS
|
||||
|
||||
|
||||
@CONV_LAYERS.register_module()
|
||||
class Conv2dAdaptivePadding(nn.Conv2d):
|
||||
"""Implementation of 2D convolution in tensorflow with `padding` as "same",
|
||||
which applies padding to input (if needed) so that input image gets fully
|
||||
covered by filter and stride you specified. For stride 1, this will ensure
|
||||
that output image size is same as input. For stride of 2, output dimensions
|
||||
will be half, for example.
|
||||
|
||||
Args:
|
||||
in_channels (int): Number of channels in the input image
|
||||
out_channels (int): Number of channels produced by the convolution
|
||||
kernel_size (int or tuple): Size of the convolving kernel
|
||||
stride (int or tuple, optional): Stride of the convolution. Default: 1
|
||||
padding (int or tuple, optional): Zero-padding added to both sides of
|
||||
the input. Default: 0
|
||||
dilation (int or tuple, optional): Spacing between kernel elements.
|
||||
Default: 1
|
||||
groups (int, optional): Number of blocked connections from input
|
||||
channels to output channels. Default: 1
|
||||
bias (bool, optional): If ``True``, adds a learnable bias to the
|
||||
output. Default: ``True``
|
||||
"""
|
||||
|
||||
def __init__(self,
|
||||
in_channels,
|
||||
out_channels,
|
||||
kernel_size,
|
||||
stride=1,
|
||||
padding=0,
|
||||
dilation=1,
|
||||
groups=1,
|
||||
bias=True):
|
||||
super().__init__(in_channels, out_channels, kernel_size, stride, 0,
|
||||
dilation, groups, bias)
|
||||
|
||||
def forward(self, x):
|
||||
img_h, img_w = x.size()[-2:]
|
||||
kernel_h, kernel_w = self.weight.size()[-2:]
|
||||
stride_h, stride_w = self.stride
|
||||
output_h = math.ceil(img_h / stride_h)
|
||||
output_w = math.ceil(img_w / stride_w)
|
||||
pad_h = (
|
||||
max((output_h - 1) * self.stride[0] +
|
||||
(kernel_h - 1) * self.dilation[0] + 1 - img_h, 0))
|
||||
pad_w = (
|
||||
max((output_w - 1) * self.stride[1] +
|
||||
(kernel_w - 1) * self.dilation[1] + 1 - img_w, 0))
|
||||
if pad_h > 0 or pad_w > 0:
|
||||
x = F.pad(x, [
|
||||
pad_w // 2, pad_w - pad_w // 2, pad_h // 2, pad_h - pad_h // 2
|
||||
])
|
||||
return F.conv2d(x, self.weight, self.bias, self.stride, self.padding,
|
||||
self.dilation, self.groups)
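
# Quick check (added for clarity, not part of the upstream file) of the padding
# arithmetic in forward() above, for a 15-pixel input, 3x3 kernel, stride 2,
# dilation 1.
import math

img, k, s, d = 15, 3, 2, 1
out = math.ceil(img / s)
pad = max((out - 1) * s + (k - 1) * d + 1 - img, 0)
print(out, pad)  # 8 2 -> one pixel of padding per side gives the "same"-style output size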
|
||||
206
comfy/annotator/uniformer/mmcv/cnn/bricks/conv_module.py
Normal file
@ -0,0 +1,206 @@
|
||||
# Copyright (c) OpenMMLab. All rights reserved.
|
||||
import warnings
|
||||
|
||||
import torch.nn as nn
|
||||
|
||||
from annotator.uniformer.mmcv.utils import _BatchNorm, _InstanceNorm
|
||||
from ..utils import constant_init, kaiming_init
|
||||
from .activation import build_activation_layer
|
||||
from .conv import build_conv_layer
|
||||
from .norm import build_norm_layer
|
||||
from .padding import build_padding_layer
|
||||
from .registry import PLUGIN_LAYERS
|
||||
|
||||
|
||||
@PLUGIN_LAYERS.register_module()
|
||||
class ConvModule(nn.Module):
|
||||
"""A conv block that bundles conv/norm/activation layers.
|
||||
|
||||
This block simplifies the usage of convolution layers, which are commonly
|
||||
used with a norm layer (e.g., BatchNorm) and activation layer (e.g., ReLU).
|
||||
It is based upon three build methods: `build_conv_layer()`,
|
||||
`build_norm_layer()` and `build_activation_layer()`.
|
||||
|
||||
In addition, this module provides some extra features:
|
||||
1. Automatically set `bias` of the conv layer.
|
||||
2. Spectral norm is supported.
|
||||
3. More padding modes are supported. Before PyTorch 1.5, nn.Conv2d only
|
||||
supports zero and circular padding, and we add "reflect" padding mode.
|
||||
|
||||
Args:
|
||||
in_channels (int): Number of channels in the input feature map.
|
||||
Same as that in ``nn._ConvNd``.
|
||||
out_channels (int): Number of channels produced by the convolution.
|
||||
Same as that in ``nn._ConvNd``.
|
||||
kernel_size (int | tuple[int]): Size of the convolving kernel.
|
||||
Same as that in ``nn._ConvNd``.
|
||||
stride (int | tuple[int]): Stride of the convolution.
|
||||
Same as that in ``nn._ConvNd``.
|
||||
padding (int | tuple[int]): Zero-padding added to both sides of
|
||||
the input. Same as that in ``nn._ConvNd``.
|
||||
dilation (int | tuple[int]): Spacing between kernel elements.
|
||||
Same as that in ``nn._ConvNd``.
|
||||
groups (int): Number of blocked connections from input channels to
|
||||
output channels. Same as that in ``nn._ConvNd``.
|
||||
bias (bool | str): If specified as `auto`, it will be decided by the
|
||||
norm_cfg. Bias will be set as True if `norm_cfg` is None, otherwise
|
||||
False. Default: "auto".
|
||||
conv_cfg (dict): Config dict for convolution layer. Default: None,
|
||||
which means using conv2d.
|
||||
norm_cfg (dict): Config dict for normalization layer. Default: None.
|
||||
act_cfg (dict): Config dict for activation layer.
|
||||
Default: dict(type='ReLU').
|
||||
inplace (bool): Whether to use inplace mode for activation.
|
||||
Default: True.
|
||||
with_spectral_norm (bool): Whether use spectral norm in conv module.
|
||||
Default: False.
|
||||
padding_mode (str): If the `padding_mode` has not been supported by
|
||||
current `Conv2d` in PyTorch, we will use our own padding layer
|
||||
instead. Currently, we support ['zeros', 'circular'] with official
|
||||
implementation and ['reflect'] with our own implementation.
|
||||
Default: 'zeros'.
|
||||
order (tuple[str]): The order of conv/norm/activation layers. It is a
|
||||
sequence of "conv", "norm" and "act". Common examples are
|
||||
("conv", "norm", "act") and ("act", "conv", "norm").
|
||||
Default: ('conv', 'norm', 'act').
|
||||
"""
|
||||
|
||||
_abbr_ = 'conv_block'
|
||||
|
||||
def __init__(self,
|
||||
in_channels,
|
||||
out_channels,
|
||||
kernel_size,
|
||||
stride=1,
|
||||
padding=0,
|
||||
dilation=1,
|
||||
groups=1,
|
||||
bias='auto',
|
||||
conv_cfg=None,
|
||||
norm_cfg=None,
|
||||
act_cfg=dict(type='ReLU'),
|
||||
inplace=True,
|
||||
with_spectral_norm=False,
|
||||
padding_mode='zeros',
|
||||
order=('conv', 'norm', 'act')):
|
||||
super(ConvModule, self).__init__()
|
||||
assert conv_cfg is None or isinstance(conv_cfg, dict)
|
||||
assert norm_cfg is None or isinstance(norm_cfg, dict)
|
||||
assert act_cfg is None or isinstance(act_cfg, dict)
|
||||
official_padding_mode = ['zeros', 'circular']
|
||||
self.conv_cfg = conv_cfg
|
||||
self.norm_cfg = norm_cfg
|
||||
self.act_cfg = act_cfg
|
||||
self.inplace = inplace
|
||||
self.with_spectral_norm = with_spectral_norm
|
||||
self.with_explicit_padding = padding_mode not in official_padding_mode
|
||||
self.order = order
|
||||
assert isinstance(self.order, tuple) and len(self.order) == 3
|
||||
assert set(order) == set(['conv', 'norm', 'act'])
|
||||
|
||||
self.with_norm = norm_cfg is not None
|
||||
self.with_activation = act_cfg is not None
|
||||
# if the conv layer is before a norm layer, bias is unnecessary.
|
||||
if bias == 'auto':
|
||||
bias = not self.with_norm
|
||||
self.with_bias = bias
|
||||
|
||||
if self.with_explicit_padding:
|
||||
pad_cfg = dict(type=padding_mode)
|
||||
self.padding_layer = build_padding_layer(pad_cfg, padding)
|
||||
|
||||
# reset padding to 0 for conv module
|
||||
conv_padding = 0 if self.with_explicit_padding else padding
|
||||
# build convolution layer
|
||||
self.conv = build_conv_layer(
|
||||
conv_cfg,
|
||||
in_channels,
|
||||
out_channels,
|
||||
kernel_size,
|
||||
stride=stride,
|
||||
padding=conv_padding,
|
||||
dilation=dilation,
|
||||
groups=groups,
|
||||
bias=bias)
|
||||
# export the attributes of self.conv to a higher level for convenience
|
||||
self.in_channels = self.conv.in_channels
|
||||
self.out_channels = self.conv.out_channels
|
||||
self.kernel_size = self.conv.kernel_size
|
||||
self.stride = self.conv.stride
|
||||
self.padding = padding
|
||||
self.dilation = self.conv.dilation
|
||||
self.transposed = self.conv.transposed
|
||||
self.output_padding = self.conv.output_padding
|
||||
self.groups = self.conv.groups
|
||||
|
||||
if self.with_spectral_norm:
|
||||
self.conv = nn.utils.spectral_norm(self.conv)
|
||||
|
||||
# build normalization layers
|
||||
if self.with_norm:
|
||||
# norm layer is after conv layer
|
||||
if order.index('norm') > order.index('conv'):
|
||||
norm_channels = out_channels
|
||||
else:
|
||||
norm_channels = in_channels
|
||||
self.norm_name, norm = build_norm_layer(norm_cfg, norm_channels)
|
||||
self.add_module(self.norm_name, norm)
|
||||
if self.with_bias:
|
||||
if isinstance(norm, (_BatchNorm, _InstanceNorm)):
|
||||
warnings.warn(
|
||||
'Unnecessary conv bias before batch/instance norm')
|
||||
else:
|
||||
self.norm_name = None
|
||||
|
||||
# build activation layer
|
||||
if self.with_activation:
|
||||
act_cfg_ = act_cfg.copy()
|
||||
# nn.Tanh has no 'inplace' argument
|
||||
if act_cfg_['type'] not in [
|
||||
'Tanh', 'PReLU', 'Sigmoid', 'HSigmoid', 'Swish'
|
||||
]:
|
||||
act_cfg_.setdefault('inplace', inplace)
|
||||
self.activate = build_activation_layer(act_cfg_)
|
||||
|
||||
# Use msra init by default
|
||||
self.init_weights()
|
||||
|
||||
@property
|
||||
def norm(self):
|
||||
if self.norm_name:
|
||||
return getattr(self, self.norm_name)
|
||||
else:
|
||||
return None
|
||||
|
||||
def init_weights(self):
|
||||
# 1. It is mainly for customized conv layers with their own
|
||||
# initialization manners by calling their own ``init_weights()``,
|
||||
# and we do not want ConvModule to override the initialization.
|
||||
# 2. For customized conv layers without their own initialization
|
||||
# manners (that is, they don't have their own ``init_weights()``)
|
||||
# and PyTorch's conv layers, they will be initialized by
|
||||
# this method with default ``kaiming_init``.
|
||||
# Note: For PyTorch's conv layers, they will be overwritten by our
|
||||
# initialization implementation using default ``kaiming_init``.
|
||||
if not hasattr(self.conv, 'init_weights'):
|
||||
if self.with_activation and self.act_cfg['type'] == 'LeakyReLU':
|
||||
nonlinearity = 'leaky_relu'
|
||||
a = self.act_cfg.get('negative_slope', 0.01)
|
||||
else:
|
||||
nonlinearity = 'relu'
|
||||
a = 0
|
||||
kaiming_init(self.conv, a=a, nonlinearity=nonlinearity)
|
||||
if self.with_norm:
|
||||
constant_init(self.norm, 1, bias=0)
|
||||
|
||||
def forward(self, x, activate=True, norm=True):
|
||||
for layer in self.order:
|
||||
if layer == 'conv':
|
||||
if self.with_explicit_padding:
|
||||
x = self.padding_layer(x)
|
||||
x = self.conv(x)
|
||||
elif layer == 'norm' and norm and self.with_norm:
|
||||
x = self.norm(x)
|
||||
elif layer == 'act' and activate and self.with_activation:
|
||||
x = self.activate(x)
|
||||
return x
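
# Usage sketch (added for clarity, not part of the upstream file); assumes the
# vendored import root `annotator.uniformer.mmcv` used throughout this commit.
import torch
from annotator.uniformer.mmcv.cnn.bricks.conv_module import ConvModule

block = ConvModule(3, 16, 3, padding=1, norm_cfg=dict(type='BN'), act_cfg=dict(type='ReLU'))
print(block.with_bias)                          # False: bias is dropped because a norm follows
print(block(torch.randn(2, 3, 32, 32)).shape)   # torch.Size([2, 16, 32, 32])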
|
||||
148
comfy/annotator/uniformer/mmcv/cnn/bricks/conv_ws.py
Normal file
@ -0,0 +1,148 @@
|
||||
# Copyright (c) OpenMMLab. All rights reserved.
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
import torch.nn.functional as F
|
||||
|
||||
from .registry import CONV_LAYERS
|
||||
|
||||
|
||||
def conv_ws_2d(input,
|
||||
weight,
|
||||
bias=None,
|
||||
stride=1,
|
||||
padding=0,
|
||||
dilation=1,
|
||||
groups=1,
|
||||
eps=1e-5):
|
||||
c_in = weight.size(0)
|
||||
weight_flat = weight.view(c_in, -1)
|
||||
mean = weight_flat.mean(dim=1, keepdim=True).view(c_in, 1, 1, 1)
|
||||
std = weight_flat.std(dim=1, keepdim=True).view(c_in, 1, 1, 1)
|
||||
weight = (weight - mean) / (std + eps)
|
||||
return F.conv2d(input, weight, bias, stride, padding, dilation, groups)
|
||||
|
||||
|
||||
@CONV_LAYERS.register_module('ConvWS')
|
||||
class ConvWS2d(nn.Conv2d):
|
||||
|
||||
def __init__(self,
|
||||
in_channels,
|
||||
out_channels,
|
||||
kernel_size,
|
||||
stride=1,
|
||||
padding=0,
|
||||
dilation=1,
|
||||
groups=1,
|
||||
bias=True,
|
||||
eps=1e-5):
|
||||
super(ConvWS2d, self).__init__(
|
||||
in_channels,
|
||||
out_channels,
|
||||
kernel_size,
|
||||
stride=stride,
|
||||
padding=padding,
|
||||
dilation=dilation,
|
||||
groups=groups,
|
||||
bias=bias)
|
||||
self.eps = eps
|
||||
|
||||
def forward(self, x):
|
||||
return conv_ws_2d(x, self.weight, self.bias, self.stride, self.padding,
|
||||
self.dilation, self.groups, self.eps)
|
||||
|
||||
|
||||
@CONV_LAYERS.register_module(name='ConvAWS')
|
||||
class ConvAWS2d(nn.Conv2d):
|
||||
"""AWS (Adaptive Weight Standardization)
|
||||
|
||||
This is a variant of Weight Standardization
(https://arxiv.org/pdf/1903.10520.pdf).
It is used in DetectoRS to avoid NaN
(https://arxiv.org/pdf/2006.02334.pdf).
|
||||
|
||||
Args:
|
||||
in_channels (int): Number of channels in the input image
|
||||
out_channels (int): Number of channels produced by the convolution
|
||||
kernel_size (int or tuple): Size of the conv kernel
|
||||
stride (int or tuple, optional): Stride of the convolution. Default: 1
|
||||
padding (int or tuple, optional): Zero-padding added to both sides of
|
||||
the input. Default: 0
|
||||
dilation (int or tuple, optional): Spacing between kernel elements.
|
||||
Default: 1
|
||||
groups (int, optional): Number of blocked connections from input
|
||||
channels to output channels. Default: 1
|
||||
bias (bool, optional): If set True, adds a learnable bias to the
|
||||
output. Default: True
|
||||
"""
|
||||
|
||||
def __init__(self,
|
||||
in_channels,
|
||||
out_channels,
|
||||
kernel_size,
|
||||
stride=1,
|
||||
padding=0,
|
||||
dilation=1,
|
||||
groups=1,
|
||||
bias=True):
|
||||
super().__init__(
|
||||
in_channels,
|
||||
out_channels,
|
||||
kernel_size,
|
||||
stride=stride,
|
||||
padding=padding,
|
||||
dilation=dilation,
|
||||
groups=groups,
|
||||
bias=bias)
|
||||
self.register_buffer('weight_gamma',
|
||||
torch.ones(self.out_channels, 1, 1, 1))
|
||||
self.register_buffer('weight_beta',
|
||||
torch.zeros(self.out_channels, 1, 1, 1))
|
||||
|
||||
def _get_weight(self, weight):
|
||||
weight_flat = weight.view(weight.size(0), -1)
|
||||
mean = weight_flat.mean(dim=1).view(-1, 1, 1, 1)
|
||||
std = torch.sqrt(weight_flat.var(dim=1) + 1e-5).view(-1, 1, 1, 1)
|
||||
weight = (weight - mean) / std
|
||||
weight = self.weight_gamma * weight + self.weight_beta
|
||||
return weight
|
||||
|
||||
def forward(self, x):
|
||||
weight = self._get_weight(self.weight)
|
||||
return F.conv2d(x, weight, self.bias, self.stride, self.padding,
|
||||
self.dilation, self.groups)
|
||||
|
||||
def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict,
|
||||
missing_keys, unexpected_keys, error_msgs):
|
||||
"""Override default load function.
|
||||
|
||||
AWS overrides the function _load_from_state_dict to recover
|
||||
weight_gamma and weight_beta if they are missing. If weight_gamma and
|
||||
weight_beta are found in the checkpoint, this function will return
|
||||
after super()._load_from_state_dict. Otherwise, it will compute the
|
||||
mean and std of the pretrained weights and store them in weight_beta
|
||||
and weight_gamma.
|
||||
"""
|
||||
|
||||
self.weight_gamma.data.fill_(-1)
|
||||
local_missing_keys = []
|
||||
super()._load_from_state_dict(state_dict, prefix, local_metadata,
|
||||
strict, local_missing_keys,
|
||||
unexpected_keys, error_msgs)
|
||||
if self.weight_gamma.data.mean() > 0:
|
||||
for k in local_missing_keys:
|
||||
missing_keys.append(k)
|
||||
return
|
||||
weight = self.weight.data
|
||||
weight_flat = weight.view(weight.size(0), -1)
|
||||
mean = weight_flat.mean(dim=1).view(-1, 1, 1, 1)
|
||||
std = torch.sqrt(weight_flat.var(dim=1) + 1e-5).view(-1, 1, 1, 1)
|
||||
self.weight_beta.data.copy_(mean)
|
||||
self.weight_gamma.data.copy_(std)
|
||||
missing_gamma_beta = [
|
||||
k for k in local_missing_keys
|
||||
if k.endswith('weight_gamma') or k.endswith('weight_beta')
|
||||
]
|
||||
for k in missing_gamma_beta:
|
||||
local_missing_keys.remove(k)
|
||||
for k in local_missing_keys:
|
||||
missing_keys.append(k)
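
# Usage sketch (added for clarity, not part of the upstream file): ConvWS2d is a
# drop-in nn.Conv2d whose forward pass standardizes each output channel's weights.
import torch
from annotator.uniformer.mmcv.cnn.bricks.conv_ws import ConvWS2d, conv_ws_2d

conv = ConvWS2d(3, 8, 3, padding=1)
x = torch.randn(1, 3, 16, 16)
same = conv_ws_2d(x, conv.weight, conv.bias, padding=1)   # functional form used internally
print(conv(x).shape, torch.allclose(conv(x), same))       # torch.Size([1, 8, 16, 16]) True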
|
||||
@ -0,0 +1,96 @@
|
||||
# Copyright (c) OpenMMLab. All rights reserved.
|
||||
import torch.nn as nn
|
||||
|
||||
from .conv_module import ConvModule
|
||||
|
||||
|
||||
class DepthwiseSeparableConvModule(nn.Module):
|
||||
"""Depthwise separable convolution module.
|
||||
|
||||
See https://arxiv.org/pdf/1704.04861.pdf for details.
|
||||
|
||||
This module can replace a ConvModule with the conv block replaced by two
conv blocks: a depthwise conv block and a pointwise conv block. The depthwise
|
||||
conv block contains depthwise-conv/norm/activation layers. The pointwise
|
||||
conv block contains pointwise-conv/norm/activation layers. It should be
|
||||
noted that there will be norm/activation layer in the depthwise conv block
|
||||
if `norm_cfg` and `act_cfg` are specified.
|
||||
|
||||
Args:
|
||||
in_channels (int): Number of channels in the input feature map.
|
||||
Same as that in ``nn._ConvNd``.
|
||||
out_channels (int): Number of channels produced by the convolution.
|
||||
Same as that in ``nn._ConvNd``.
|
||||
kernel_size (int | tuple[int]): Size of the convolving kernel.
|
||||
Same as that in ``nn._ConvNd``.
|
||||
stride (int | tuple[int]): Stride of the convolution.
|
||||
Same as that in ``nn._ConvNd``. Default: 1.
|
||||
padding (int | tuple[int]): Zero-padding added to both sides of
|
||||
the input. Same as that in ``nn._ConvNd``. Default: 0.
|
||||
dilation (int | tuple[int]): Spacing between kernel elements.
|
||||
Same as that in ``nn._ConvNd``. Default: 1.
|
||||
norm_cfg (dict): Default norm config for both depthwise ConvModule and
|
||||
pointwise ConvModule. Default: None.
|
||||
act_cfg (dict): Default activation config for both depthwise ConvModule
|
||||
and pointwise ConvModule. Default: dict(type='ReLU').
|
||||
dw_norm_cfg (dict): Norm config of depthwise ConvModule. If it is
|
||||
'default', it will be the same as `norm_cfg`. Default: 'default'.
|
||||
dw_act_cfg (dict): Activation config of depthwise ConvModule. If it is
|
||||
'default', it will be the same as `act_cfg`. Default: 'default'.
|
||||
pw_norm_cfg (dict): Norm config of pointwise ConvModule. If it is
|
||||
'default', it will be the same as `norm_cfg`. Default: 'default'.
|
||||
pw_act_cfg (dict): Activation config of pointwise ConvModule. If it is
|
||||
'default', it will be the same as `act_cfg`. Default: 'default'.
|
||||
kwargs (optional): Other shared arguments for depthwise and pointwise
|
||||
ConvModule. See ConvModule for ref.
|
||||
"""
|
||||
|
||||
def __init__(self,
|
||||
in_channels,
|
||||
out_channels,
|
||||
kernel_size,
|
||||
stride=1,
|
||||
padding=0,
|
||||
dilation=1,
|
||||
norm_cfg=None,
|
||||
act_cfg=dict(type='ReLU'),
|
||||
dw_norm_cfg='default',
|
||||
dw_act_cfg='default',
|
||||
pw_norm_cfg='default',
|
||||
pw_act_cfg='default',
|
||||
**kwargs):
|
||||
super(DepthwiseSeparableConvModule, self).__init__()
|
||||
assert 'groups' not in kwargs, 'groups should not be specified'
|
||||
|
||||
# if norm/activation config of depthwise/pointwise ConvModule is not
|
||||
# specified, use default config.
|
||||
dw_norm_cfg = dw_norm_cfg if dw_norm_cfg != 'default' else norm_cfg
|
||||
dw_act_cfg = dw_act_cfg if dw_act_cfg != 'default' else act_cfg
|
||||
pw_norm_cfg = pw_norm_cfg if pw_norm_cfg != 'default' else norm_cfg
|
||||
pw_act_cfg = pw_act_cfg if pw_act_cfg != 'default' else act_cfg
|
||||
|
||||
# depthwise convolution
|
||||
self.depthwise_conv = ConvModule(
|
||||
in_channels,
|
||||
in_channels,
|
||||
kernel_size,
|
||||
stride=stride,
|
||||
padding=padding,
|
||||
dilation=dilation,
|
||||
groups=in_channels,
|
||||
norm_cfg=dw_norm_cfg,
|
||||
act_cfg=dw_act_cfg,
|
||||
**kwargs)
|
||||
|
||||
self.pointwise_conv = ConvModule(
|
||||
in_channels,
|
||||
out_channels,
|
||||
1,
|
||||
norm_cfg=pw_norm_cfg,
|
||||
act_cfg=pw_act_cfg,
|
||||
**kwargs)
|
||||
|
||||
def forward(self, x):
|
||||
x = self.depthwise_conv(x)
|
||||
x = self.pointwise_conv(x)
|
||||
return x
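
# Usage sketch (added for clarity, not part of the upstream file); the module
# path is assumed from the class name because the hunk header above omits the
# file name.
import torch
from annotator.uniformer.mmcv.cnn.bricks.depthwise_separable_conv_module import \
    DepthwiseSeparableConvModule

block = DepthwiseSeparableConvModule(32, 64, 3, padding=1)
print(block(torch.randn(1, 32, 28, 28)).shape)   # torch.Size([1, 64, 28, 28])
# Far fewer weights than a plain 3x3 conv: 32*1*3*3 + 32*64*1*1 vs 32*64*3*3.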
|
||||
65
comfy/annotator/uniformer/mmcv/cnn/bricks/drop.py
Normal file
@ -0,0 +1,65 @@
|
||||
# Copyright (c) OpenMMLab. All rights reserved.
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
|
||||
from annotator.uniformer.mmcv import build_from_cfg
|
||||
from .registry import DROPOUT_LAYERS
|
||||
|
||||
|
||||
def drop_path(x, drop_prob=0., training=False):
|
||||
"""Drop paths (Stochastic Depth) per sample (when applied in main path of
|
||||
residual blocks).
|
||||
|
||||
We follow the implementation
|
||||
https://github.com/rwightman/pytorch-image-models/blob/a2727c1bf78ba0d7b5727f5f95e37fb7f8866b1f/timm/models/layers/drop.py # noqa: E501
|
||||
"""
|
||||
if drop_prob == 0. or not training:
|
||||
return x
|
||||
keep_prob = 1 - drop_prob
|
||||
# handle tensors with different dimensions, not just 4D tensors.
|
||||
shape = (x.shape[0], ) + (1, ) * (x.ndim - 1)
|
||||
random_tensor = keep_prob + torch.rand(
|
||||
shape, dtype=x.dtype, device=x.device)
|
||||
output = x.div(keep_prob) * random_tensor.floor()
|
||||
return output
|
||||
|
||||
|
||||
@DROPOUT_LAYERS.register_module()
|
||||
class DropPath(nn.Module):
|
||||
"""Drop paths (Stochastic Depth) per sample (when applied in main path of
|
||||
residual blocks).
|
||||
|
||||
We follow the implementation
|
||||
https://github.com/rwightman/pytorch-image-models/blob/a2727c1bf78ba0d7b5727f5f95e37fb7f8866b1f/timm/models/layers/drop.py # noqa: E501
|
||||
|
||||
Args:
|
||||
drop_prob (float): Probability of the path to be zeroed. Default: 0.1
|
||||
"""
|
||||
|
||||
def __init__(self, drop_prob=0.1):
|
||||
super(DropPath, self).__init__()
|
||||
self.drop_prob = drop_prob
|
||||
|
||||
def forward(self, x):
|
||||
return drop_path(x, self.drop_prob, self.training)
|
||||
|
||||
|
||||
@DROPOUT_LAYERS.register_module()
|
||||
class Dropout(nn.Dropout):
|
||||
"""A wrapper for ``torch.nn.Dropout``, We rename the ``p`` of
|
||||
``torch.nn.Dropout`` to ``drop_prob`` so as to be consistent with
|
||||
``DropPath``
|
||||
|
||||
Args:
|
||||
drop_prob (float): Probability of the elements to be
|
||||
zeroed. Default: 0.5.
|
||||
inplace (bool): Do the operation inplace or not. Default: False.
|
||||
"""
|
||||
|
||||
def __init__(self, drop_prob=0.5, inplace=False):
|
||||
super().__init__(p=drop_prob, inplace=inplace)
|
||||
|
||||
|
||||
def build_dropout(cfg, default_args=None):
|
||||
"""Builder for drop out layers."""
|
||||
return build_from_cfg(cfg, DROPOUT_LAYERS, default_args)
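
# Usage sketch (added for clarity, not part of the upstream file): DropPath only
# takes effect in training mode and rescales surviving samples by 1 / keep_prob.
import torch
from annotator.uniformer.mmcv.cnn.bricks.drop import DropPath

dp = DropPath(drop_prob=0.2)
x = torch.ones(4, 3, 8, 8)
dp.train()
y = dp(x)                      # whole samples are zeroed, survivors scaled by 1/0.8
dp.eval()
print(torch.equal(dp(x), x))   # True: identity at inference time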
|
||||
@ -0,0 +1,412 @@
|
||||
# Copyright (c) OpenMMLab. All rights reserved.
|
||||
import math
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
import torch.nn.functional as F
|
||||
|
||||
from ..utils import kaiming_init
|
||||
from .registry import PLUGIN_LAYERS
|
||||
|
||||
|
||||
@PLUGIN_LAYERS.register_module()
|
||||
class GeneralizedAttention(nn.Module):
|
||||
"""GeneralizedAttention module.
|
||||
|
||||
See 'An Empirical Study of Spatial Attention Mechanisms in Deep Networks'
|
||||
(https://arxiv.org/abs/1904.05873) for details.
|
||||
|
||||
Args:
|
||||
in_channels (int): Channels of the input feature map.
|
||||
spatial_range (int): The spatial range. -1 indicates no spatial range
|
||||
constraint. Default: -1.
|
||||
num_heads (int): The head number of empirical_attention module.
|
||||
Default: 9.
|
||||
position_embedding_dim (int): The position embedding dimension.
|
||||
Default: -1.
|
||||
position_magnitude (int): A multiplier acting on coord difference.
|
||||
Default: 1.
|
||||
kv_stride (int): The feature stride acting on key/value feature map.
|
||||
Default: 2.
|
||||
q_stride (int): The feature stride acting on query feature map.
|
||||
Default: 1.
|
||||
attention_type (str): A binary indicator string for indicating which
|
||||
items in generalized empirical_attention module are used.
|
||||
Default: '1111'.
|
||||
|
||||
- '1000' indicates 'query and key content' (appr - appr) item,
|
||||
- '0100' indicates 'query content and relative position'
|
||||
(appr - position) item,
|
||||
- '0010' indicates 'key content only' (bias - appr) item,
|
||||
- '0001' indicates 'relative position only' (bias - position) item.
|
||||
"""
|
||||
|
||||
_abbr_ = 'gen_attention_block'
|
||||
|
||||
def __init__(self,
|
||||
in_channels,
|
||||
spatial_range=-1,
|
||||
num_heads=9,
|
||||
position_embedding_dim=-1,
|
||||
position_magnitude=1,
|
||||
kv_stride=2,
|
||||
q_stride=1,
|
||||
attention_type='1111'):
|
||||
|
||||
super(GeneralizedAttention, self).__init__()
|
||||
|
||||
# hard range means local range for non-local operation
|
||||
self.position_embedding_dim = (
|
||||
position_embedding_dim
|
||||
if position_embedding_dim > 0 else in_channels)
|
||||
|
||||
self.position_magnitude = position_magnitude
|
||||
self.num_heads = num_heads
|
||||
self.in_channels = in_channels
|
||||
self.spatial_range = spatial_range
|
||||
self.kv_stride = kv_stride
|
||||
self.q_stride = q_stride
|
||||
self.attention_type = [bool(int(_)) for _ in attention_type]
|
||||
self.qk_embed_dim = in_channels // num_heads
|
||||
out_c = self.qk_embed_dim * num_heads
|
||||
|
||||
if self.attention_type[0] or self.attention_type[1]:
|
||||
self.query_conv = nn.Conv2d(
|
||||
in_channels=in_channels,
|
||||
out_channels=out_c,
|
||||
kernel_size=1,
|
||||
bias=False)
|
||||
self.query_conv.kaiming_init = True
|
||||
|
||||
if self.attention_type[0] or self.attention_type[2]:
|
||||
self.key_conv = nn.Conv2d(
|
||||
in_channels=in_channels,
|
||||
out_channels=out_c,
|
||||
kernel_size=1,
|
||||
bias=False)
|
||||
self.key_conv.kaiming_init = True
|
||||
|
||||
self.v_dim = in_channels // num_heads
|
||||
self.value_conv = nn.Conv2d(
|
||||
in_channels=in_channels,
|
||||
out_channels=self.v_dim * num_heads,
|
||||
kernel_size=1,
|
||||
bias=False)
|
||||
self.value_conv.kaiming_init = True
|
||||
|
||||
if self.attention_type[1] or self.attention_type[3]:
|
||||
self.appr_geom_fc_x = nn.Linear(
|
||||
self.position_embedding_dim // 2, out_c, bias=False)
|
||||
self.appr_geom_fc_x.kaiming_init = True
|
||||
|
||||
self.appr_geom_fc_y = nn.Linear(
|
||||
self.position_embedding_dim // 2, out_c, bias=False)
|
||||
self.appr_geom_fc_y.kaiming_init = True
|
||||
|
||||
if self.attention_type[2]:
|
||||
stdv = 1.0 / math.sqrt(self.qk_embed_dim * 2)
|
||||
appr_bias_value = -2 * stdv * torch.rand(out_c) + stdv
|
||||
self.appr_bias = nn.Parameter(appr_bias_value)
|
||||
|
||||
if self.attention_type[3]:
|
||||
stdv = 1.0 / math.sqrt(self.qk_embed_dim * 2)
|
||||
geom_bias_value = -2 * stdv * torch.rand(out_c) + stdv
|
||||
self.geom_bias = nn.Parameter(geom_bias_value)
|
||||
|
||||
self.proj_conv = nn.Conv2d(
|
||||
in_channels=self.v_dim * num_heads,
|
||||
out_channels=in_channels,
|
||||
kernel_size=1,
|
||||
bias=True)
|
||||
self.proj_conv.kaiming_init = True
|
||||
self.gamma = nn.Parameter(torch.zeros(1))
|
||||
|
||||
if self.spatial_range >= 0:
|
||||
# only works when non local is after 3*3 conv
|
||||
if in_channels == 256:
|
||||
max_len = 84
|
||||
elif in_channels == 512:
|
||||
max_len = 42
|
||||
|
||||
max_len_kv = int((max_len - 1.0) / self.kv_stride + 1)
|
||||
local_constraint_map = np.ones(
|
||||
(max_len, max_len, max_len_kv, max_len_kv), dtype=int)  # np.int was removed in NumPy 1.24
|
||||
for iy in range(max_len):
|
||||
for ix in range(max_len):
|
||||
local_constraint_map[
|
||||
iy, ix,
|
||||
max((iy - self.spatial_range) //
|
||||
self.kv_stride, 0):min((iy + self.spatial_range +
|
||||
1) // self.kv_stride +
|
||||
1, max_len),
|
||||
max((ix - self.spatial_range) //
|
||||
self.kv_stride, 0):min((ix + self.spatial_range +
|
||||
1) // self.kv_stride +
|
||||
1, max_len)] = 0
|
||||
|
||||
self.local_constraint_map = nn.Parameter(
|
||||
torch.from_numpy(local_constraint_map).byte(),
|
||||
requires_grad=False)
|
||||
|
||||
if self.q_stride > 1:
|
||||
self.q_downsample = nn.AvgPool2d(
|
||||
kernel_size=1, stride=self.q_stride)
|
||||
else:
|
||||
self.q_downsample = None
|
||||
|
||||
if self.kv_stride > 1:
|
||||
self.kv_downsample = nn.AvgPool2d(
|
||||
kernel_size=1, stride=self.kv_stride)
|
||||
else:
|
||||
self.kv_downsample = None
|
||||
|
||||
self.init_weights()
|
||||
|
||||
def get_position_embedding(self,
|
||||
h,
|
||||
w,
|
||||
h_kv,
|
||||
w_kv,
|
||||
q_stride,
|
||||
kv_stride,
|
||||
device,
|
||||
dtype,
|
||||
feat_dim,
|
||||
wave_length=1000):
|
||||
# the default type of Tensor is float32, leading to type mismatch
|
||||
# in fp16 mode. Cast it to support fp16 mode.
|
||||
h_idxs = torch.linspace(0, h - 1, h).to(device=device, dtype=dtype)
|
||||
h_idxs = h_idxs.view((h, 1)) * q_stride
|
||||
|
||||
w_idxs = torch.linspace(0, w - 1, w).to(device=device, dtype=dtype)
|
||||
w_idxs = w_idxs.view((w, 1)) * q_stride
|
||||
|
||||
h_kv_idxs = torch.linspace(0, h_kv - 1, h_kv).to(
|
||||
device=device, dtype=dtype)
|
||||
h_kv_idxs = h_kv_idxs.view((h_kv, 1)) * kv_stride
|
||||
|
||||
w_kv_idxs = torch.linspace(0, w_kv - 1, w_kv).to(
|
||||
device=device, dtype=dtype)
|
||||
w_kv_idxs = w_kv_idxs.view((w_kv, 1)) * kv_stride
|
||||
|
||||
# (h, h_kv, 1)
|
||||
h_diff = h_idxs.unsqueeze(1) - h_kv_idxs.unsqueeze(0)
|
||||
h_diff *= self.position_magnitude
|
||||
|
||||
# (w, w_kv, 1)
|
||||
w_diff = w_idxs.unsqueeze(1) - w_kv_idxs.unsqueeze(0)
|
||||
w_diff *= self.position_magnitude
|
||||
|
||||
feat_range = torch.arange(0, feat_dim / 4).to(
|
||||
device=device, dtype=dtype)
|
||||
|
||||
dim_mat = torch.Tensor([wave_length]).to(device=device, dtype=dtype)
|
||||
dim_mat = dim_mat**((4. / feat_dim) * feat_range)
|
||||
dim_mat = dim_mat.view((1, 1, -1))
|
||||
|
||||
embedding_x = torch.cat(
|
||||
((w_diff / dim_mat).sin(), (w_diff / dim_mat).cos()), dim=2)
|
||||
|
||||
embedding_y = torch.cat(
|
||||
((h_diff / dim_mat).sin(), (h_diff / dim_mat).cos()), dim=2)
|
||||
|
||||
return embedding_x, embedding_y
|
||||
|
||||
def forward(self, x_input):
|
||||
num_heads = self.num_heads
|
||||
|
||||
# use empirical_attention
|
||||
if self.q_downsample is not None:
|
||||
x_q = self.q_downsample(x_input)
|
||||
else:
|
||||
x_q = x_input
|
||||
n, _, h, w = x_q.shape
|
||||
|
||||
if self.kv_downsample is not None:
|
||||
x_kv = self.kv_downsample(x_input)
|
||||
else:
|
||||
x_kv = x_input
|
||||
_, _, h_kv, w_kv = x_kv.shape
|
||||
|
||||
if self.attention_type[0] or self.attention_type[1]:
|
||||
proj_query = self.query_conv(x_q).view(
|
||||
(n, num_heads, self.qk_embed_dim, h * w))
|
||||
proj_query = proj_query.permute(0, 1, 3, 2)
|
||||
|
||||
if self.attention_type[0] or self.attention_type[2]:
|
||||
proj_key = self.key_conv(x_kv).view(
|
||||
(n, num_heads, self.qk_embed_dim, h_kv * w_kv))
|
||||
|
||||
if self.attention_type[1] or self.attention_type[3]:
|
||||
position_embed_x, position_embed_y = self.get_position_embedding(
|
||||
h, w, h_kv, w_kv, self.q_stride, self.kv_stride,
|
||||
x_input.device, x_input.dtype, self.position_embedding_dim)
|
||||
# (n, num_heads, w, w_kv, dim)
|
||||
position_feat_x = self.appr_geom_fc_x(position_embed_x).\
|
||||
view(1, w, w_kv, num_heads, self.qk_embed_dim).\
|
||||
permute(0, 3, 1, 2, 4).\
|
||||
repeat(n, 1, 1, 1, 1)
|
||||
|
||||
# (n, num_heads, h, h_kv, dim)
|
||||
position_feat_y = self.appr_geom_fc_y(position_embed_y).\
|
||||
view(1, h, h_kv, num_heads, self.qk_embed_dim).\
|
||||
permute(0, 3, 1, 2, 4).\
|
||||
repeat(n, 1, 1, 1, 1)
|
||||
|
||||
position_feat_x /= math.sqrt(2)
|
||||
position_feat_y /= math.sqrt(2)
|
||||
|
||||
# accelerate for saliency only
|
||||
if (np.sum(self.attention_type) == 1) and self.attention_type[2]:
|
||||
appr_bias = self.appr_bias.\
|
||||
view(1, num_heads, 1, self.qk_embed_dim).\
|
||||
repeat(n, 1, 1, 1)
|
||||
|
||||
energy = torch.matmul(appr_bias, proj_key).\
|
||||
view(n, num_heads, 1, h_kv * w_kv)
|
||||
|
||||
h = 1
|
||||
w = 1
|
||||
else:
|
||||
# (n, num_heads, h*w, h_kv*w_kv), query before key, 540mb for
|
||||
if not self.attention_type[0]:
|
||||
energy = torch.zeros(
|
||||
n,
|
||||
num_heads,
|
||||
h,
|
||||
w,
|
||||
h_kv,
|
||||
w_kv,
|
||||
dtype=x_input.dtype,
|
||||
device=x_input.device)
|
||||
|
||||
# attention_type[0]: appr - appr
|
||||
# attention_type[1]: appr - position
|
||||
# attention_type[2]: bias - appr
|
||||
# attention_type[3]: bias - position
|
||||
if self.attention_type[0] or self.attention_type[2]:
|
||||
if self.attention_type[0] and self.attention_type[2]:
|
||||
appr_bias = self.appr_bias.\
|
||||
view(1, num_heads, 1, self.qk_embed_dim)
|
||||
energy = torch.matmul(proj_query + appr_bias, proj_key).\
|
||||
view(n, num_heads, h, w, h_kv, w_kv)
|
||||
|
||||
elif self.attention_type[0]:
|
||||
energy = torch.matmul(proj_query, proj_key).\
|
||||
view(n, num_heads, h, w, h_kv, w_kv)
|
||||
|
||||
elif self.attention_type[2]:
|
||||
appr_bias = self.appr_bias.\
|
||||
view(1, num_heads, 1, self.qk_embed_dim).\
|
||||
repeat(n, 1, 1, 1)
|
||||
|
||||
energy += torch.matmul(appr_bias, proj_key).\
|
||||
view(n, num_heads, 1, 1, h_kv, w_kv)
|
||||
|
||||
if self.attention_type[1] or self.attention_type[3]:
|
||||
if self.attention_type[1] and self.attention_type[3]:
|
||||
geom_bias = self.geom_bias.\
|
||||
view(1, num_heads, 1, self.qk_embed_dim)
|
||||
|
||||
proj_query_reshape = (proj_query + geom_bias).\
|
||||
view(n, num_heads, h, w, self.qk_embed_dim)
|
||||
|
||||
energy_x = torch.matmul(
|
||||
proj_query_reshape.permute(0, 1, 3, 2, 4),
|
||||
position_feat_x.permute(0, 1, 2, 4, 3))
|
||||
energy_x = energy_x.\
|
||||
permute(0, 1, 3, 2, 4).unsqueeze(4)
|
||||
|
||||
energy_y = torch.matmul(
|
||||
proj_query_reshape,
|
||||
position_feat_y.permute(0, 1, 2, 4, 3))
|
||||
energy_y = energy_y.unsqueeze(5)
|
||||
|
||||
energy += energy_x + energy_y
|
||||
|
||||
elif self.attention_type[1]:
|
||||
proj_query_reshape = proj_query.\
|
||||
view(n, num_heads, h, w, self.qk_embed_dim)
|
||||
proj_query_reshape = proj_query_reshape.\
|
||||
permute(0, 1, 3, 2, 4)
|
||||
position_feat_x_reshape = position_feat_x.\
|
||||
permute(0, 1, 2, 4, 3)
|
||||
position_feat_y_reshape = position_feat_y.\
|
||||
permute(0, 1, 2, 4, 3)
|
||||
|
||||
energy_x = torch.matmul(proj_query_reshape,
|
||||
position_feat_x_reshape)
|
||||
energy_x = energy_x.permute(0, 1, 3, 2, 4).unsqueeze(4)
|
||||
|
||||
energy_y = torch.matmul(proj_query_reshape,
|
||||
position_feat_y_reshape)
|
||||
energy_y = energy_y.unsqueeze(5)
|
||||
|
||||
energy += energy_x + energy_y
|
||||
|
||||
elif self.attention_type[3]:
|
||||
geom_bias = self.geom_bias.\
|
||||
view(1, num_heads, self.qk_embed_dim, 1).\
|
||||
repeat(n, 1, 1, 1)
|
||||
|
||||
position_feat_x_reshape = position_feat_x.\
|
||||
view(n, num_heads, w*w_kv, self.qk_embed_dim)
|
||||
|
||||
position_feat_y_reshape = position_feat_y.\
|
||||
view(n, num_heads, h * h_kv, self.qk_embed_dim)
|
||||
|
||||
energy_x = torch.matmul(position_feat_x_reshape, geom_bias)
|
||||
energy_x = energy_x.view(n, num_heads, 1, w, 1, w_kv)
|
||||
|
||||
energy_y = torch.matmul(position_feat_y_reshape, geom_bias)
|
||||
energy_y = energy_y.view(n, num_heads, h, 1, h_kv, 1)
|
||||
|
||||
energy += energy_x + energy_y
|
||||
|
||||
energy = energy.view(n, num_heads, h * w, h_kv * w_kv)
|
||||
|
||||
if self.spatial_range >= 0:
|
||||
cur_local_constraint_map = \
|
||||
self.local_constraint_map[:h, :w, :h_kv, :w_kv].\
|
||||
contiguous().\
|
||||
view(1, 1, h*w, h_kv*w_kv)
|
||||
|
||||
energy = energy.masked_fill_(cur_local_constraint_map,
|
||||
float('-inf'))
|
||||
|
||||
attention = F.softmax(energy, 3)
|
||||
|
||||
proj_value = self.value_conv(x_kv)
|
||||
proj_value_reshape = proj_value.\
|
||||
view((n, num_heads, self.v_dim, h_kv * w_kv)).\
|
||||
permute(0, 1, 3, 2)
|
||||
|
||||
out = torch.matmul(attention, proj_value_reshape).\
|
||||
permute(0, 1, 3, 2).\
|
||||
contiguous().\
|
||||
view(n, self.v_dim * self.num_heads, h, w)
|
||||
|
||||
out = self.proj_conv(out)
|
||||
|
||||
# output is downsampled, upsample back to input size
|
||||
if self.q_downsample is not None:
|
||||
out = F.interpolate(
|
||||
out,
|
||||
size=x_input.shape[2:],
|
||||
mode='bilinear',
|
||||
align_corners=False)
|
||||
|
||||
out = self.gamma * out + x_input
|
||||
return out
|
||||
|
||||
def init_weights(self):
|
||||
for m in self.modules():
|
||||
if hasattr(m, 'kaiming_init') and m.kaiming_init:
|
||||
kaiming_init(
|
||||
m,
|
||||
mode='fan_in',
|
||||
nonlinearity='leaky_relu',
|
||||
bias=0,
|
||||
distribution='uniform',
|
||||
a=1)
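
# Usage sketch (added for clarity, not part of the upstream file); the module
# path is assumed from the class name because the hunk header above omits the
# file name. The block is residual, so the output shape matches the input.
import torch
from annotator.uniformer.mmcv.cnn.bricks.generalized_attention import GeneralizedAttention

attn = GeneralizedAttention(in_channels=64, num_heads=8)   # default attention_type='1111'
print(attn(torch.randn(1, 64, 16, 16)).shape)              # torch.Size([1, 64, 16, 16])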
|
||||
34
comfy/annotator/uniformer/mmcv/cnn/bricks/hsigmoid.py
Normal file
@ -0,0 +1,34 @@
|
||||
# Copyright (c) OpenMMLab. All rights reserved.
|
||||
import torch.nn as nn
|
||||
|
||||
from .registry import ACTIVATION_LAYERS
|
||||
|
||||
|
||||
@ACTIVATION_LAYERS.register_module()
|
||||
class HSigmoid(nn.Module):
|
||||
"""Hard Sigmoid Module. Apply the hard sigmoid function:
|
||||
Hsigmoid(x) = min(max((x + bias) / divisor, min_value), max_value)
|
||||
Default: Hsigmoid(x) = min(max((x + 1) / 2, 0), 1)
|
||||
|
||||
Args:
|
||||
bias (float): Bias of the input feature map. Default: 1.0.
|
||||
divisor (float): Divisor of the input feature map. Default: 2.0.
|
||||
min_value (float): Lower bound value. Default: 0.0.
|
||||
max_value (float): Upper bound value. Default: 1.0.
|
||||
|
||||
Returns:
|
||||
Tensor: The output tensor.
|
||||
"""
|
||||
|
||||
def __init__(self, bias=1.0, divisor=2.0, min_value=0.0, max_value=1.0):
|
||||
super(HSigmoid, self).__init__()
|
||||
self.bias = bias
|
||||
self.divisor = divisor
|
||||
assert self.divisor != 0
|
||||
self.min_value = min_value
|
||||
self.max_value = max_value
|
||||
|
||||
def forward(self, x):
|
||||
x = (x + self.bias) / self.divisor
|
||||
|
||||
return x.clamp_(self.min_value, self.max_value)
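
# Quick check (added for clarity, not part of the upstream file) of the default
# hard sigmoid, min(max((x + 1) / 2, 0), 1).
import torch
from annotator.uniformer.mmcv.cnn.bricks.hsigmoid import HSigmoid

act = HSigmoid()
print(act(torch.tensor([-3.0, 0.0, 3.0])))   # tensor([0.0000, 0.5000, 1.0000])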
|
||||
29
comfy/annotator/uniformer/mmcv/cnn/bricks/hswish.py
Normal file
@ -0,0 +1,29 @@
|
||||
# Copyright (c) OpenMMLab. All rights reserved.
|
||||
import torch.nn as nn
|
||||
|
||||
from .registry import ACTIVATION_LAYERS
|
||||
|
||||
|
||||
@ACTIVATION_LAYERS.register_module()
|
||||
class HSwish(nn.Module):
|
||||
"""Hard Swish Module.
|
||||
|
||||
This module applies the hard swish function:
|
||||
|
||||
.. math::
|
||||
Hswish(x) = x * ReLU6(x + 3) / 6
|
||||
|
||||
Args:
|
||||
inplace (bool): can optionally do the operation in-place.
|
||||
Default: False.
|
||||
|
||||
Returns:
|
||||
Tensor: The output tensor.
|
||||
"""
|
||||
|
||||
def __init__(self, inplace=False):
|
||||
super(HSwish, self).__init__()
|
||||
self.act = nn.ReLU6(inplace)
|
||||
|
||||
def forward(self, x):
|
||||
return x * self.act(x + 3) / 6
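
# Quick check (added for clarity, not part of the upstream file) of
# x * ReLU6(x + 3) / 6.
import torch
from annotator.uniformer.mmcv.cnn.bricks.hswish import HSwish

act = HSwish()
print(act(torch.tensor([-4.0, 0.0, 4.0])))   # tensor([-0., 0., 4.])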
|
||||
306
comfy/annotator/uniformer/mmcv/cnn/bricks/non_local.py
Normal file
@ -0,0 +1,306 @@
|
||||
# Copyright (c) OpenMMLab. All rights reserved.
|
||||
from abc import ABCMeta
|
||||
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
|
||||
from ..utils import constant_init, normal_init
|
||||
from .conv_module import ConvModule
|
||||
from .registry import PLUGIN_LAYERS
|
||||
|
||||
|
||||
class _NonLocalNd(nn.Module, metaclass=ABCMeta):
|
||||
"""Basic Non-local module.
|
||||
|
||||
This module is proposed in
|
||||
"Non-local Neural Networks"
|
||||
Paper reference: https://arxiv.org/abs/1711.07971
|
||||
Code reference: https://github.com/AlexHex7/Non-local_pytorch
|
||||
|
||||
Args:
|
||||
in_channels (int): Channels of the input feature map.
|
||||
reduction (int): Channel reduction ratio. Default: 2.
|
||||
use_scale (bool): Whether to scale pairwise_weight by
|
||||
`1/sqrt(inter_channels)` when the mode is `embedded_gaussian`.
|
||||
Default: True.
|
||||
conv_cfg (None | dict): The config dict for convolution layers.
|
||||
If not specified, it will use `nn.Conv2d` for convolution layers.
|
||||
Default: None.
|
||||
norm_cfg (None | dict): The config dict for normalization layers.
|
||||
Default: None. (This parameter is only applicable to conv_out.)
|
||||
mode (str): Options are `gaussian`, `concatenation`,
|
||||
`embedded_gaussian` and `dot_product`. Default: embedded_gaussian.
|
||||
"""
|
||||
|
||||
def __init__(self,
|
||||
in_channels,
|
||||
reduction=2,
|
||||
use_scale=True,
|
||||
conv_cfg=None,
|
||||
norm_cfg=None,
|
||||
mode='embedded_gaussian',
|
||||
**kwargs):
|
||||
super(_NonLocalNd, self).__init__()
|
||||
self.in_channels = in_channels
|
||||
self.reduction = reduction
|
||||
self.use_scale = use_scale
|
||||
self.inter_channels = max(in_channels // reduction, 1)
|
||||
self.mode = mode
|
||||
|
||||
if mode not in [
|
||||
'gaussian', 'embedded_gaussian', 'dot_product', 'concatenation'
|
||||
]:
|
||||
raise ValueError("Mode should be in 'gaussian', 'concatenation', "
|
||||
f"'embedded_gaussian' or 'dot_product', but got "
|
||||
f'{mode} instead.')
|
||||
|
||||
# g, theta, phi are defaulted as `nn.ConvNd`.
|
||||
# Here we use ConvModule for potential usage.
|
||||
self.g = ConvModule(
|
||||
self.in_channels,
|
||||
self.inter_channels,
|
||||
kernel_size=1,
|
||||
conv_cfg=conv_cfg,
|
||||
act_cfg=None)
|
||||
self.conv_out = ConvModule(
|
||||
self.inter_channels,
|
||||
self.in_channels,
|
||||
kernel_size=1,
|
||||
conv_cfg=conv_cfg,
|
||||
norm_cfg=norm_cfg,
|
||||
act_cfg=None)
|
||||
|
||||
if self.mode != 'gaussian':
|
||||
self.theta = ConvModule(
|
||||
self.in_channels,
|
||||
self.inter_channels,
|
||||
kernel_size=1,
|
||||
conv_cfg=conv_cfg,
|
||||
act_cfg=None)
|
||||
self.phi = ConvModule(
|
||||
self.in_channels,
|
||||
self.inter_channels,
|
||||
kernel_size=1,
|
||||
conv_cfg=conv_cfg,
|
||||
act_cfg=None)
|
||||
|
||||
if self.mode == 'concatenation':
|
||||
self.concat_project = ConvModule(
|
||||
self.inter_channels * 2,
|
||||
1,
|
||||
kernel_size=1,
|
||||
stride=1,
|
||||
padding=0,
|
||||
bias=False,
|
||||
act_cfg=dict(type='ReLU'))
|
||||
|
||||
self.init_weights(**kwargs)
|
||||
|
||||
def init_weights(self, std=0.01, zeros_init=True):
|
||||
if self.mode != 'gaussian':
|
||||
for m in [self.g, self.theta, self.phi]:
|
||||
normal_init(m.conv, std=std)
|
||||
else:
|
||||
normal_init(self.g.conv, std=std)
|
||||
if zeros_init:
|
||||
if self.conv_out.norm_cfg is None:
|
||||
constant_init(self.conv_out.conv, 0)
|
||||
else:
|
||||
constant_init(self.conv_out.norm, 0)
|
||||
else:
|
||||
if self.conv_out.norm_cfg is None:
|
||||
normal_init(self.conv_out.conv, std=std)
|
||||
else:
|
||||
normal_init(self.conv_out.norm, std=std)
|
||||
|
||||
def gaussian(self, theta_x, phi_x):
|
||||
# NonLocal1d pairwise_weight: [N, H, H]
|
||||
# NonLocal2d pairwise_weight: [N, HxW, HxW]
|
||||
# NonLocal3d pairwise_weight: [N, TxHxW, TxHxW]
|
||||
pairwise_weight = torch.matmul(theta_x, phi_x)
|
||||
pairwise_weight = pairwise_weight.softmax(dim=-1)
|
||||
return pairwise_weight
|
||||
|
||||
def embedded_gaussian(self, theta_x, phi_x):
|
||||
# NonLocal1d pairwise_weight: [N, H, H]
|
||||
# NonLocal2d pairwise_weight: [N, HxW, HxW]
|
||||
# NonLocal3d pairwise_weight: [N, TxHxW, TxHxW]
|
||||
pairwise_weight = torch.matmul(theta_x, phi_x)
|
||||
if self.use_scale:
|
||||
# theta_x.shape[-1] is `self.inter_channels`
|
||||
pairwise_weight /= theta_x.shape[-1]**0.5
|
||||
pairwise_weight = pairwise_weight.softmax(dim=-1)
|
||||
return pairwise_weight
|
||||
|
||||
def dot_product(self, theta_x, phi_x):
|
||||
# NonLocal1d pairwise_weight: [N, H, H]
|
||||
# NonLocal2d pairwise_weight: [N, HxW, HxW]
|
||||
# NonLocal3d pairwise_weight: [N, TxHxW, TxHxW]
|
||||
pairwise_weight = torch.matmul(theta_x, phi_x)
|
||||
pairwise_weight /= pairwise_weight.shape[-1]
|
||||
return pairwise_weight
|
||||
|
||||
def concatenation(self, theta_x, phi_x):
|
||||
# NonLocal1d pairwise_weight: [N, H, H]
|
||||
# NonLocal2d pairwise_weight: [N, HxW, HxW]
|
||||
# NonLocal3d pairwise_weight: [N, TxHxW, TxHxW]
|
||||
h = theta_x.size(2)
|
||||
w = phi_x.size(3)
|
||||
theta_x = theta_x.repeat(1, 1, 1, w)
|
||||
phi_x = phi_x.repeat(1, 1, h, 1)
|
||||
|
||||
concat_feature = torch.cat([theta_x, phi_x], dim=1)
|
||||
pairwise_weight = self.concat_project(concat_feature)
|
||||
n, _, h, w = pairwise_weight.size()
|
||||
pairwise_weight = pairwise_weight.view(n, h, w)
|
||||
pairwise_weight /= pairwise_weight.shape[-1]
|
||||
|
||||
return pairwise_weight
|
||||
|
||||
def forward(self, x):
|
||||
# Assume `reduction = 1`, then `inter_channels = C`
|
||||
# or `inter_channels = C` when `mode="gaussian"`
|
||||
|
||||
# NonLocal1d x: [N, C, H]
|
||||
# NonLocal2d x: [N, C, H, W]
|
||||
# NonLocal3d x: [N, C, T, H, W]
|
||||
n = x.size(0)
|
||||
|
||||
# NonLocal1d g_x: [N, H, C]
|
||||
# NonLocal2d g_x: [N, HxW, C]
|
||||
# NonLocal3d g_x: [N, TxHxW, C]
|
||||
g_x = self.g(x).view(n, self.inter_channels, -1)
|
||||
g_x = g_x.permute(0, 2, 1)
|
||||
|
||||
# NonLocal1d theta_x: [N, H, C], phi_x: [N, C, H]
|
||||
# NonLocal2d theta_x: [N, HxW, C], phi_x: [N, C, HxW]
|
||||
# NonLocal3d theta_x: [N, TxHxW, C], phi_x: [N, C, TxHxW]
|
||||
if self.mode == 'gaussian':
|
||||
theta_x = x.view(n, self.in_channels, -1)
|
||||
theta_x = theta_x.permute(0, 2, 1)
|
||||
if self.sub_sample:
|
||||
phi_x = self.phi(x).view(n, self.in_channels, -1)
|
||||
else:
|
||||
phi_x = x.view(n, self.in_channels, -1)
|
||||
elif self.mode == 'concatenation':
|
||||
theta_x = self.theta(x).view(n, self.inter_channels, -1, 1)
|
||||
phi_x = self.phi(x).view(n, self.inter_channels, 1, -1)
|
||||
else:
|
||||
theta_x = self.theta(x).view(n, self.inter_channels, -1)
|
||||
theta_x = theta_x.permute(0, 2, 1)
|
||||
phi_x = self.phi(x).view(n, self.inter_channels, -1)
|
||||
|
||||
pairwise_func = getattr(self, self.mode)
|
||||
# NonLocal1d pairwise_weight: [N, H, H]
|
||||
# NonLocal2d pairwise_weight: [N, HxW, HxW]
|
||||
# NonLocal3d pairwise_weight: [N, TxHxW, TxHxW]
|
||||
pairwise_weight = pairwise_func(theta_x, phi_x)
|
||||
|
||||
# NonLocal1d y: [N, H, C]
|
||||
# NonLocal2d y: [N, HxW, C]
|
||||
# NonLocal3d y: [N, TxHxW, C]
|
||||
y = torch.matmul(pairwise_weight, g_x)
|
||||
# NonLocal1d y: [N, C, H]
|
||||
# NonLocal2d y: [N, C, H, W]
|
||||
# NonLocal3d y: [N, C, T, H, W]
|
||||
y = y.permute(0, 2, 1).contiguous().reshape(n, self.inter_channels,
|
||||
*x.size()[2:])
|
||||
|
||||
output = x + self.conv_out(y)
|
||||
|
||||
return output
|
||||
|
||||
|
||||
class NonLocal1d(_NonLocalNd):
|
||||
"""1D Non-local module.
|
||||
|
||||
Args:
|
||||
in_channels (int): Same as `NonLocalND`.
|
||||
sub_sample (bool): Whether to apply max pooling after pairwise
|
||||
function (Note that the `sub_sample` is applied on spatial only).
|
||||
Default: False.
|
||||
conv_cfg (None | dict): Same as `NonLocalND`.
|
||||
Default: dict(type='Conv1d').
|
||||
"""
|
||||
|
||||
def __init__(self,
|
||||
in_channels,
|
||||
sub_sample=False,
|
||||
conv_cfg=dict(type='Conv1d'),
|
||||
**kwargs):
|
||||
super(NonLocal1d, self).__init__(
|
||||
in_channels, conv_cfg=conv_cfg, **kwargs)
|
||||
|
||||
self.sub_sample = sub_sample
|
||||
|
||||
if sub_sample:
|
||||
max_pool_layer = nn.MaxPool1d(kernel_size=2)
|
||||
self.g = nn.Sequential(self.g, max_pool_layer)
|
||||
if self.mode != 'gaussian':
|
||||
self.phi = nn.Sequential(self.phi, max_pool_layer)
|
||||
else:
|
||||
self.phi = max_pool_layer
|
||||
|
||||
|
||||
@PLUGIN_LAYERS.register_module()
|
||||
class NonLocal2d(_NonLocalNd):
|
||||
"""2D Non-local module.
|
||||
|
||||
Args:
|
||||
in_channels (int): Same as `NonLocalND`.
|
||||
sub_sample (bool): Whether to apply max pooling after pairwise
|
||||
function (Note that the `sub_sample` is applied on spatial only).
|
||||
Default: False.
|
||||
conv_cfg (None | dict): Same as `NonLocalND`.
|
||||
Default: dict(type='Conv2d').
|
||||
"""
|
||||
|
||||
_abbr_ = 'nonlocal_block'
|
||||
|
||||
def __init__(self,
|
||||
in_channels,
|
||||
sub_sample=False,
|
||||
conv_cfg=dict(type='Conv2d'),
|
||||
**kwargs):
|
||||
super(NonLocal2d, self).__init__(
|
||||
in_channels, conv_cfg=conv_cfg, **kwargs)
|
||||
|
||||
self.sub_sample = sub_sample
|
||||
|
||||
if sub_sample:
|
||||
max_pool_layer = nn.MaxPool2d(kernel_size=(2, 2))
|
||||
self.g = nn.Sequential(self.g, max_pool_layer)
|
||||
if self.mode != 'gaussian':
|
||||
self.phi = nn.Sequential(self.phi, max_pool_layer)
|
||||
else:
|
||||
self.phi = max_pool_layer
|
||||
|
||||
|
||||
class NonLocal3d(_NonLocalNd):
|
||||
"""3D Non-local module.
|
||||
|
||||
Args:
|
||||
in_channels (int): Same as `NonLocalND`.
|
||||
sub_sample (bool): Whether to apply max pooling after pairwise
|
||||
function (Note that the `sub_sample` is applied on spatial only).
|
||||
Default: False.
|
||||
conv_cfg (None | dict): Same as `NonLocalND`.
|
||||
Default: dict(type='Conv3d').
|
||||
"""
|
||||
|
||||
def __init__(self,
|
||||
in_channels,
|
||||
sub_sample=False,
|
||||
conv_cfg=dict(type='Conv3d'),
|
||||
**kwargs):
|
||||
super(NonLocal3d, self).__init__(
|
||||
in_channels, conv_cfg=conv_cfg, **kwargs)
|
||||
self.sub_sample = sub_sample
|
||||
|
||||
if sub_sample:
|
||||
max_pool_layer = nn.MaxPool3d(kernel_size=(1, 2, 2))
|
||||
self.g = nn.Sequential(self.g, max_pool_layer)
|
||||
if self.mode != 'gaussian':
|
||||
self.phi = nn.Sequential(self.phi, max_pool_layer)
|
||||
else:
|
||||
self.phi = max_pool_layer
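
# Usage sketch (added for clarity, not part of the upstream file); NonLocal2d
# preserves the input shape because the block ends with a residual connection.
import torch
from annotator.uniformer.mmcv.cnn.bricks.non_local import NonLocal2d

block = NonLocal2d(in_channels=32)               # embedded_gaussian mode by default
print(block(torch.randn(2, 32, 20, 20)).shape)   # torch.Size([2, 32, 20, 20])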
|
||||
144
comfy/annotator/uniformer/mmcv/cnn/bricks/norm.py
Normal file
@ -0,0 +1,144 @@
|
||||
# Copyright (c) OpenMMLab. All rights reserved.
|
||||
import inspect
|
||||
|
||||
import torch.nn as nn
|
||||
|
||||
from annotator.uniformer.mmcv.utils import is_tuple_of
|
||||
from annotator.uniformer.mmcv.utils.parrots_wrapper import SyncBatchNorm, _BatchNorm, _InstanceNorm
|
||||
from .registry import NORM_LAYERS
|
||||
|
||||
NORM_LAYERS.register_module('BN', module=nn.BatchNorm2d)
|
||||
NORM_LAYERS.register_module('BN1d', module=nn.BatchNorm1d)
|
||||
NORM_LAYERS.register_module('BN2d', module=nn.BatchNorm2d)
|
||||
NORM_LAYERS.register_module('BN3d', module=nn.BatchNorm3d)
|
||||
NORM_LAYERS.register_module('SyncBN', module=SyncBatchNorm)
|
||||
NORM_LAYERS.register_module('GN', module=nn.GroupNorm)
|
||||
NORM_LAYERS.register_module('LN', module=nn.LayerNorm)
|
||||
NORM_LAYERS.register_module('IN', module=nn.InstanceNorm2d)
|
||||
NORM_LAYERS.register_module('IN1d', module=nn.InstanceNorm1d)
|
||||
NORM_LAYERS.register_module('IN2d', module=nn.InstanceNorm2d)
|
||||
NORM_LAYERS.register_module('IN3d', module=nn.InstanceNorm3d)
|
||||
|
||||
|
||||
def infer_abbr(class_type):
|
||||
"""Infer abbreviation from the class name.
|
||||
|
||||
When we build a norm layer with `build_norm_layer()`, we want to preserve
|
||||
the norm type in variable names, e.g, self.bn1, self.gn. This method will
|
||||
infer the abbreviation to map class types to abbreviations.
|
||||
|
||||
Rule 1: If the class has the property "_abbr_", return the property.
|
||||
Rule 2: If the parent class is _BatchNorm, GroupNorm, LayerNorm or
|
||||
InstanceNorm, the abbreviation of this layer will be "bn", "gn", "ln" and
|
||||
"in" respectively.
|
||||
Rule 3: If the class name contains "batch", "group", "layer" or "instance",
|
||||
the abbreviation of this layer will be "bn", "gn", "ln" and "in"
|
||||
respectively.
|
||||
Rule 4: Otherwise, the abbreviation falls back to "norm".
|
||||
|
||||
Args:
|
||||
class_type (type): The norm layer type.
|
||||
|
||||
Returns:
|
||||
str: The inferred abbreviation.
|
||||
"""
|
||||
if not inspect.isclass(class_type):
|
||||
raise TypeError(
|
||||
f'class_type must be a type, but got {type(class_type)}')
|
||||
if hasattr(class_type, '_abbr_'):
|
||||
return class_type._abbr_
|
||||
if issubclass(class_type, _InstanceNorm): # IN is a subclass of BN
|
||||
return 'in'
|
||||
elif issubclass(class_type, _BatchNorm):
|
||||
return 'bn'
|
||||
elif issubclass(class_type, nn.GroupNorm):
|
||||
return 'gn'
|
||||
elif issubclass(class_type, nn.LayerNorm):
|
||||
return 'ln'
|
||||
else:
|
||||
class_name = class_type.__name__.lower()
|
||||
if 'batch' in class_name:
|
||||
return 'bn'
|
||||
elif 'group' in class_name:
|
||||
return 'gn'
|
||||
elif 'layer' in class_name:
|
||||
return 'ln'
|
||||
elif 'instance' in class_name:
|
||||
return 'in'
|
||||
else:
|
||||
return 'norm_layer'
|
||||
|
||||
|
||||
def build_norm_layer(cfg, num_features, postfix=''):
|
||||
"""Build normalization layer.
|
||||
|
||||
Args:
|
||||
cfg (dict): The norm layer config, which should contain:
|
||||
|
||||
- type (str): Layer type.
|
||||
- layer args: Args needed to instantiate a norm layer.
|
||||
- requires_grad (bool, optional): Whether stop gradient updates.
|
||||
num_features (int): Number of input channels.
|
||||
postfix (int | str): The postfix to be appended into norm abbreviation
|
||||
to create named layer.
|
||||
|
||||
Returns:
|
||||
(str, nn.Module): The first element is the layer name consisting of
|
||||
abbreviation and postfix, e.g., bn1, gn. The second element is the
|
||||
created norm layer.
|
||||
"""
|
||||
if not isinstance(cfg, dict):
|
||||
raise TypeError('cfg must be a dict')
|
||||
if 'type' not in cfg:
|
||||
raise KeyError('the cfg dict must contain the key "type"')
|
||||
cfg_ = cfg.copy()
|
||||
|
||||
layer_type = cfg_.pop('type')
|
||||
if layer_type not in NORM_LAYERS:
|
||||
raise KeyError(f'Unrecognized norm type {layer_type}')
|
||||
|
||||
norm_layer = NORM_LAYERS.get(layer_type)
|
||||
abbr = infer_abbr(norm_layer)
|
||||
|
||||
assert isinstance(postfix, (int, str))
|
||||
name = abbr + str(postfix)
|
||||
|
||||
requires_grad = cfg_.pop('requires_grad', True)
|
||||
cfg_.setdefault('eps', 1e-5)
|
||||
if layer_type != 'GN':
|
||||
layer = norm_layer(num_features, **cfg_)
|
||||
if layer_type == 'SyncBN' and hasattr(layer, '_specify_ddp_gpu_num'):
|
||||
layer._specify_ddp_gpu_num(1)
|
||||
else:
|
||||
assert 'num_groups' in cfg_
|
||||
layer = norm_layer(num_channels=num_features, **cfg_)
|
||||
|
||||
for param in layer.parameters():
|
||||
param.requires_grad = requires_grad
|
||||
|
||||
return name, layer
|
||||
|
||||
|
||||
def is_norm(layer, exclude=None):
|
||||
"""Check if a layer is a normalization layer.
|
||||
|
||||
Args:
|
||||
layer (nn.Module): The layer to be checked.
|
||||
exclude (type | tuple[type]): Types to be excluded.
|
||||
|
||||
Returns:
|
||||
bool: Whether the layer is a norm layer.
|
||||
"""
|
||||
if exclude is not None:
|
||||
if not isinstance(exclude, tuple):
|
||||
exclude = (exclude, )
|
||||
if not is_tuple_of(exclude, type):
|
||||
raise TypeError(
|
||||
f'"exclude" must be either None or type or a tuple of types, '
|
||||
f'but got {type(exclude)}: {exclude}')
|
||||
|
||||
if exclude and isinstance(layer, exclude):
|
||||
return False
|
||||
|
||||
all_norm_bases = (_BatchNorm, _InstanceNorm, nn.GroupNorm, nn.LayerNorm)
|
||||
return isinstance(layer, all_norm_bases)
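
# Usage sketch (added for clarity, not part of the upstream file) for
# build_norm_layer and is_norm.
import torch.nn as nn
from annotator.uniformer.mmcv.cnn.bricks.norm import build_norm_layer, is_norm

name, layer = build_norm_layer(dict(type='BN', requires_grad=True), 16)
print(name, isinstance(layer, nn.BatchNorm2d))                 # bn True
print(is_norm(layer), is_norm(layer, exclude=nn.BatchNorm2d))  # True False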
|
||||
36
comfy/annotator/uniformer/mmcv/cnn/bricks/padding.py
Normal file
@ -0,0 +1,36 @@
|
||||
# Copyright (c) OpenMMLab. All rights reserved.
|
||||
import torch.nn as nn
|
||||
|
||||
from .registry import PADDING_LAYERS
|
||||
|
||||
PADDING_LAYERS.register_module('zero', module=nn.ZeroPad2d)
|
||||
PADDING_LAYERS.register_module('reflect', module=nn.ReflectionPad2d)
|
||||
PADDING_LAYERS.register_module('replicate', module=nn.ReplicationPad2d)
|
||||
|
||||
|
||||
def build_padding_layer(cfg, *args, **kwargs):
|
||||
"""Build padding layer.
|
||||
|
||||
Args:
|
||||
cfg (None or dict): The padding layer config, which should contain:
|
||||
- type (str): Layer type.
|
||||
- layer args: Args needed to instantiate a padding layer.
|
||||
|
||||
Returns:
|
||||
nn.Module: Created padding layer.
|
||||
"""
|
||||
if not isinstance(cfg, dict):
|
||||
raise TypeError('cfg must be a dict')
|
||||
if 'type' not in cfg:
|
||||
raise KeyError('the cfg dict must contain the key "type"')
|
||||
|
||||
cfg_ = cfg.copy()
|
||||
padding_type = cfg_.pop('type')
|
||||
if padding_type not in PADDING_LAYERS:
|
||||
raise KeyError(f'Unrecognized padding type {padding_type}.')
|
||||
else:
|
||||
padding_layer = PADDING_LAYERS.get(padding_type)
|
||||
|
||||
layer = padding_layer(*args, **kwargs, **cfg_)
|
||||
|
||||
return layer
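
# Usage sketch (added for clarity, not part of the upstream file); positional
# args after the cfg are forwarded to the padding layer's constructor.
import torch
from annotator.uniformer.mmcv.cnn.bricks.padding import build_padding_layer

pad = build_padding_layer(dict(type='reflect'), 1)   # -> nn.ReflectionPad2d(1)
print(pad(torch.randn(1, 3, 8, 8)).shape)            # torch.Size([1, 3, 10, 10])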
|
||||
88
comfy/annotator/uniformer/mmcv/cnn/bricks/plugin.py
Normal file
@ -0,0 +1,88 @@
import inspect
import platform

from .registry import PLUGIN_LAYERS

if platform.system() == 'Windows':
    import regex as re
else:
    import re


def infer_abbr(class_type):
    """Infer abbreviation from the class name.

    This method will infer the abbreviation to map class types to
    abbreviations.

    Rule 1: If the class has the property "abbr", return the property.
    Rule 2: Otherwise, the abbreviation falls back to snake case of class
    name, e.g. the abbreviation of ``FancyBlock`` will be ``fancy_block``.

    Args:
        class_type (type): The norm layer type.

    Returns:
        str: The inferred abbreviation.
    """

    def camel2snack(word):
        """Convert camel case word into snack case.

        Modified from `inflection lib
        <https://inflection.readthedocs.io/en/latest/#inflection.underscore>`_.

        Example::

            >>> camel2snack("FancyBlock")
            'fancy_block'
        """

        word = re.sub(r'([A-Z]+)([A-Z][a-z])', r'\1_\2', word)
        word = re.sub(r'([a-z\d])([A-Z])', r'\1_\2', word)
        word = word.replace('-', '_')
        return word.lower()

    if not inspect.isclass(class_type):
        raise TypeError(
            f'class_type must be a type, but got {type(class_type)}')
    if hasattr(class_type, '_abbr_'):
        return class_type._abbr_
    else:
        return camel2snack(class_type.__name__)


def build_plugin_layer(cfg, postfix='', **kwargs):
    """Build plugin layer.

    Args:
        cfg (None or dict): cfg should contain:
            type (str): identify plugin layer type.
            layer args: args needed to instantiate a plugin layer.
        postfix (int, str): appended into norm abbreviation to
            create named layer. Default: ''.

    Returns:
        tuple[str, nn.Module]:
            name (str): abbreviation + postfix
            layer (nn.Module): created plugin layer
    """
    if not isinstance(cfg, dict):
        raise TypeError('cfg must be a dict')
    if 'type' not in cfg:
        raise KeyError('the cfg dict must contain the key "type"')
    cfg_ = cfg.copy()

    layer_type = cfg_.pop('type')
    if layer_type not in PLUGIN_LAYERS:
        raise KeyError(f'Unrecognized plugin type {layer_type}')

    plugin_layer = PLUGIN_LAYERS.get(layer_type)
    abbr = infer_abbr(plugin_layer)

    assert isinstance(postfix, (int, str))
    name = abbr + str(postfix)

    layer = plugin_layer(**kwargs, **cfg_)

    return name, layer
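How infer_abbr behaves, sketched with hypothetical classes (editor's illustration, not part of the commit):

from annotator.uniformer.mmcv.cnn.bricks.plugin import infer_abbr

class FancyBlock:                # no _abbr_: falls back to snake case of the name
    pass

class ContextBlock:
    _abbr_ = 'context_block'     # an explicit _abbr_ attribute wins

assert infer_abbr(FancyBlock) == 'fancy_block'
assert infer_abbr(ContextBlock) == 'context_block'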
16
comfy/annotator/uniformer/mmcv/cnn/bricks/registry.py
Normal file
@ -0,0 +1,16 @@
# Copyright (c) OpenMMLab. All rights reserved.
from annotator.uniformer.mmcv.utils import Registry

CONV_LAYERS = Registry('conv layer')
NORM_LAYERS = Registry('norm layer')
ACTIVATION_LAYERS = Registry('activation layer')
PADDING_LAYERS = Registry('padding layer')
UPSAMPLE_LAYERS = Registry('upsample layer')
PLUGIN_LAYERS = Registry('plugin layer')

DROPOUT_LAYERS = Registry('drop out layers')
POSITIONAL_ENCODING = Registry('position encoding')
ATTENTION = Registry('attention')
FEEDFORWARD_NETWORK = Registry('feed-forward Network')
TRANSFORMER_LAYER = Registry('transformerLayer')
TRANSFORMER_LAYER_SEQUENCE = Registry('transformer-layers sequence')
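These globals follow the Registry pattern: a decorator records a class under a name, and the builders later look it up by that name. A small sketch (editor's illustration, using the same Registry import as the file above):

from annotator.uniformer.mmcv.utils import Registry

MY_LAYERS = Registry('my layer')

@MY_LAYERS.register_module()     # registered under the class name 'MyLayer'
class MyLayer:
    pass

assert MY_LAYERS.get('MyLayer') is MyLayer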
21
comfy/annotator/uniformer/mmcv/cnn/bricks/scale.py
Normal file
@ -0,0 +1,21 @@
# Copyright (c) OpenMMLab. All rights reserved.
import torch
import torch.nn as nn


class Scale(nn.Module):
    """A learnable scale parameter.

    This layer scales the input by a learnable factor. It multiplies a
    learnable scale parameter of shape (1,) with input of any shape.

    Args:
        scale (float): Initial value of scale factor. Default: 1.0
    """

    def __init__(self, scale=1.0):
        super(Scale, self).__init__()
        self.scale = nn.Parameter(torch.tensor(scale, dtype=torch.float))

    def forward(self, x):
        return x * self.scale
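Usage sketch for Scale (editor's illustration): it multiplies every element by a single learnable scalar, typically so a branch can start small and grow during training.

import torch
from annotator.uniformer.mmcv.cnn.bricks.scale import Scale

s = Scale(scale=0.5)
out = s(torch.ones(2, 3))    # every element becomes 0.5; s.scale is a learnable Parameter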
25
comfy/annotator/uniformer/mmcv/cnn/bricks/swish.py
Normal file
@ -0,0 +1,25 @@
# Copyright (c) OpenMMLab. All rights reserved.
import torch
import torch.nn as nn

from .registry import ACTIVATION_LAYERS


@ACTIVATION_LAYERS.register_module()
class Swish(nn.Module):
    """Swish Module.

    This module applies the swish function:

    .. math::
        Swish(x) = x * Sigmoid(x)

    Returns:
        Tensor: The output tensor.
    """

    def __init__(self):
        super(Swish, self).__init__()

    def forward(self, x):
        return x * torch.sigmoid(x)
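Because Swish is registered in ACTIVATION_LAYERS, it can also be created from a config such as act_cfg=dict(type='Swish'). A sketch (editor's illustration; build_activation_layer is imported the same way transformer.py below imports it):

import torch
from annotator.uniformer.mmcv.cnn import build_activation_layer

act = build_activation_layer(dict(type='Swish'))
y = act(torch.randn(4))      # equivalent to x * torch.sigmoid(x)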
595
comfy/annotator/uniformer/mmcv/cnn/bricks/transformer.py
Normal file
@ -0,0 +1,595 @@
# Copyright (c) OpenMMLab. All rights reserved.
import copy
import warnings

import torch
import torch.nn as nn

from annotator.uniformer.mmcv import ConfigDict, deprecated_api_warning
from annotator.uniformer.mmcv.cnn import Linear, build_activation_layer, build_norm_layer
from annotator.uniformer.mmcv.runner.base_module import BaseModule, ModuleList, Sequential
from annotator.uniformer.mmcv.utils import build_from_cfg
from .drop import build_dropout
from .registry import (ATTENTION, FEEDFORWARD_NETWORK, POSITIONAL_ENCODING,
                       TRANSFORMER_LAYER, TRANSFORMER_LAYER_SEQUENCE)

# Avoid BC-breaking of importing MultiScaleDeformableAttention from this file
try:
    from annotator.uniformer.mmcv.ops.multi_scale_deform_attn import MultiScaleDeformableAttention  # noqa F401
    warnings.warn(
        ImportWarning(
            '``MultiScaleDeformableAttention`` has been moved to '
            '``mmcv.ops.multi_scale_deform_attn``, please change original path '  # noqa E501
            '``from annotator.uniformer.mmcv.cnn.bricks.transformer import MultiScaleDeformableAttention`` '  # noqa E501
            'to ``from annotator.uniformer.mmcv.ops.multi_scale_deform_attn import MultiScaleDeformableAttention`` '  # noqa E501
        ))

except ImportError:
    warnings.warn('Fail to import ``MultiScaleDeformableAttention`` from '
                  '``mmcv.ops.multi_scale_deform_attn``, '
                  'You should install ``mmcv-full`` if you need this module. ')
def build_positional_encoding(cfg, default_args=None):
    """Builder for Position Encoding."""
    return build_from_cfg(cfg, POSITIONAL_ENCODING, default_args)


def build_attention(cfg, default_args=None):
    """Builder for attention."""
    return build_from_cfg(cfg, ATTENTION, default_args)


def build_feedforward_network(cfg, default_args=None):
    """Builder for feed-forward network (FFN)."""
    return build_from_cfg(cfg, FEEDFORWARD_NETWORK, default_args)


def build_transformer_layer(cfg, default_args=None):
    """Builder for transformer layer."""
    return build_from_cfg(cfg, TRANSFORMER_LAYER, default_args)


def build_transformer_layer_sequence(cfg, default_args=None):
    """Builder for transformer encoder and transformer decoder."""
    return build_from_cfg(cfg, TRANSFORMER_LAYER_SEQUENCE, default_args)
@ATTENTION.register_module()
class MultiheadAttention(BaseModule):
    """A wrapper for ``torch.nn.MultiheadAttention``.

    This module implements MultiheadAttention with identity connection,
    and positional encoding is also passed as input.

    Args:
        embed_dims (int): The embedding dimension.
        num_heads (int): Parallel attention heads.
        attn_drop (float): A Dropout layer on attn_output_weights.
            Default: 0.0.
        proj_drop (float): A Dropout layer after `nn.MultiheadAttention`.
            Default: 0.0.
        dropout_layer (obj:`ConfigDict`): The dropout_layer used
            when adding the shortcut.
        init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization.
            Default: None.
        batch_first (bool): When it is True, Key, Query and Value are shape of
            (batch, n, embed_dim), otherwise (n, batch, embed_dim).
            Default to False.
    """

    def __init__(self,
                 embed_dims,
                 num_heads,
                 attn_drop=0.,
                 proj_drop=0.,
                 dropout_layer=dict(type='Dropout', drop_prob=0.),
                 init_cfg=None,
                 batch_first=False,
                 **kwargs):
        super(MultiheadAttention, self).__init__(init_cfg)
        if 'dropout' in kwargs:
            warnings.warn('The arguments `dropout` in MultiheadAttention '
                          'has been deprecated, now you can separately '
                          'set `attn_drop`(float), proj_drop(float), '
                          'and `dropout_layer`(dict) ')
            attn_drop = kwargs['dropout']
            dropout_layer['drop_prob'] = kwargs.pop('dropout')

        self.embed_dims = embed_dims
        self.num_heads = num_heads
        self.batch_first = batch_first

        self.attn = nn.MultiheadAttention(embed_dims, num_heads, attn_drop,
                                          **kwargs)

        self.proj_drop = nn.Dropout(proj_drop)
        self.dropout_layer = build_dropout(
            dropout_layer) if dropout_layer else nn.Identity()

    @deprecated_api_warning({'residual': 'identity'},
                            cls_name='MultiheadAttention')
    def forward(self,
                query,
                key=None,
                value=None,
                identity=None,
                query_pos=None,
                key_pos=None,
                attn_mask=None,
                key_padding_mask=None,
                **kwargs):
        """Forward function for `MultiheadAttention`.

        **kwargs allow passing a more general data flow when combining
        with other operations in `transformerlayer`.

        Args:
            query (Tensor): The input query with shape [num_queries, bs,
                embed_dims] if self.batch_first is False, else
                [bs, num_queries embed_dims].
            key (Tensor): The key tensor with shape [num_keys, bs,
                embed_dims] if self.batch_first is False, else
                [bs, num_keys, embed_dims] .
                If None, the ``query`` will be used. Defaults to None.
            value (Tensor): The value tensor with same shape as `key`.
                Same in `nn.MultiheadAttention.forward`. Defaults to None.
                If None, the `key` will be used.
            identity (Tensor): This tensor, with the same shape as x,
                will be used for the identity link.
                If None, `x` will be used. Defaults to None.
            query_pos (Tensor): The positional encoding for query, with
                the same shape as `x`. If not None, it will
                be added to `x` before forward function. Defaults to None.
            key_pos (Tensor): The positional encoding for `key`, with the
                same shape as `key`. Defaults to None. If not None, it will
                be added to `key` before forward function. If None, and
                `query_pos` has the same shape as `key`, then `query_pos`
                will be used for `key_pos`. Defaults to None.
            attn_mask (Tensor): ByteTensor mask with shape [num_queries,
                num_keys]. Same in `nn.MultiheadAttention.forward`.
                Defaults to None.
            key_padding_mask (Tensor): ByteTensor with shape [bs, num_keys].
                Defaults to None.

        Returns:
            Tensor: forwarded results with shape
                [num_queries, bs, embed_dims]
                if self.batch_first is False, else
                [bs, num_queries embed_dims].
        """

        if key is None:
            key = query
        if value is None:
            value = key
        if identity is None:
            identity = query
        if key_pos is None:
            if query_pos is not None:
                # use query_pos if key_pos is not available
                if query_pos.shape == key.shape:
                    key_pos = query_pos
                else:
                    warnings.warn(f'position encoding of key is'
                                  f'missing in {self.__class__.__name__}.')
        if query_pos is not None:
            query = query + query_pos
        if key_pos is not None:
            key = key + key_pos

        # Because the dataflow('key', 'query', 'value') of
        # ``torch.nn.MultiheadAttention`` is (num_query, batch,
        # embed_dims), We should adjust the shape of dataflow from
        # batch_first (batch, num_query, embed_dims) to num_query_first
        # (num_query ,batch, embed_dims), and recover ``attn_output``
        # from num_query_first to batch_first.
        if self.batch_first:
            query = query.transpose(0, 1)
            key = key.transpose(0, 1)
            value = value.transpose(0, 1)

        out = self.attn(
            query=query,
            key=key,
            value=value,
            attn_mask=attn_mask,
            key_padding_mask=key_padding_mask)[0]

        if self.batch_first:
            out = out.transpose(0, 1)

        return identity + self.dropout_layer(self.proj_drop(out))
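Usage sketch for the wrapper above (editor's illustration): it adds positional encodings and a residual/identity connection around torch.nn.MultiheadAttention. Shapes use the default batch_first=False layout.

import torch
from annotator.uniformer.mmcv.cnn.bricks.transformer import MultiheadAttention

attn = MultiheadAttention(embed_dims=256, num_heads=8)
q = torch.randn(100, 2, 256)          # [num_queries, bs, embed_dims]
q_pos = torch.randn(100, 2, 256)
out = attn(q, query_pos=q_pos)        # key/value default to query (self-attention)
print(out.shape)                      # torch.Size([100, 2, 256])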
@FEEDFORWARD_NETWORK.register_module()
class FFN(BaseModule):
    """Implements feed-forward networks (FFNs) with identity connection.

    Args:
        embed_dims (int): The feature dimension. Same as
            `MultiheadAttention`. Defaults: 256.
        feedforward_channels (int): The hidden dimension of FFNs.
            Defaults: 1024.
        num_fcs (int, optional): The number of fully-connected layers in
            FFNs. Default: 2.
        act_cfg (dict, optional): The activation config for FFNs.
            Default: dict(type='ReLU')
        ffn_drop (float, optional): Probability of an element to be
            zeroed in FFN. Default 0.0.
        add_identity (bool, optional): Whether to add the
            identity connection. Default: `True`.
        dropout_layer (obj:`ConfigDict`): The dropout_layer used
            when adding the shortcut.
        init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization.
            Default: None.
    """

    @deprecated_api_warning(
        {
            'dropout': 'ffn_drop',
            'add_residual': 'add_identity'
        },
        cls_name='FFN')
    def __init__(self,
                 embed_dims=256,
                 feedforward_channels=1024,
                 num_fcs=2,
                 act_cfg=dict(type='ReLU', inplace=True),
                 ffn_drop=0.,
                 dropout_layer=None,
                 add_identity=True,
                 init_cfg=None,
                 **kwargs):
        super(FFN, self).__init__(init_cfg)
        assert num_fcs >= 2, 'num_fcs should be no less ' \
            f'than 2. got {num_fcs}.'
        self.embed_dims = embed_dims
        self.feedforward_channels = feedforward_channels
        self.num_fcs = num_fcs
        self.act_cfg = act_cfg
        self.activate = build_activation_layer(act_cfg)

        layers = []
        in_channels = embed_dims
        for _ in range(num_fcs - 1):
            layers.append(
                Sequential(
                    Linear(in_channels, feedforward_channels), self.activate,
                    nn.Dropout(ffn_drop)))
            in_channels = feedforward_channels
        layers.append(Linear(feedforward_channels, embed_dims))
        layers.append(nn.Dropout(ffn_drop))
        self.layers = Sequential(*layers)
        self.dropout_layer = build_dropout(
            dropout_layer) if dropout_layer else torch.nn.Identity()
        self.add_identity = add_identity

    @deprecated_api_warning({'residual': 'identity'}, cls_name='FFN')
    def forward(self, x, identity=None):
        """Forward function for `FFN`.

        The function would add x to the output tensor if residue is None.
        """
        out = self.layers(x)
        if not self.add_identity:
            return self.dropout_layer(out)
        if identity is None:
            identity = x
        return identity + self.dropout_layer(out)
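FFN sketch (editor's illustration): with the defaults it is Linear(256 -> 1024) + ReLU + Dropout, then Linear(1024 -> 256) + Dropout, plus a residual connection.

import torch
from annotator.uniformer.mmcv.cnn.bricks.transformer import FFN

ffn = FFN(embed_dims=256, feedforward_channels=1024)
x = torch.randn(100, 2, 256)
y = ffn(x)                   # same shape; x itself is used as the identity branch
print(y.shape)               # torch.Size([100, 2, 256])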
@TRANSFORMER_LAYER.register_module()
class BaseTransformerLayer(BaseModule):
    """Base `TransformerLayer` for vision transformer.

    It can be built from `mmcv.ConfigDict` and support more flexible
    customization, for example, using any number of `FFN or LN ` and
    use different kinds of `attention` by specifying a list of `ConfigDict`
    named `attn_cfgs`. It is worth mentioning that it supports `prenorm`
    when you specifying `norm` as the first element of `operation_order`.
    More details about the `prenorm`: `On Layer Normalization in the
    Transformer Architecture <https://arxiv.org/abs/2002.04745>`_ .

    Args:
        attn_cfgs (list[`mmcv.ConfigDict`] | obj:`mmcv.ConfigDict` | None )):
            Configs for `self_attention` or `cross_attention` modules,
            The order of the configs in the list should be consistent with
            corresponding attentions in operation_order.
            If it is a dict, all of the attention modules in operation_order
            will be built with this config. Default: None.
        ffn_cfgs (list[`mmcv.ConfigDict`] | obj:`mmcv.ConfigDict` | None )):
            Configs for FFN, The order of the configs in the list should be
            consistent with corresponding ffn in operation_order.
            If it is a dict, all of the attention modules in operation_order
            will be built with this config.
        operation_order (tuple[str]): The execution order of operation
            in transformer. Such as ('self_attn', 'norm', 'ffn', 'norm').
            Support `prenorm` when you specifying first element as `norm`.
            Default:None.
        norm_cfg (dict): Config dict for normalization layer.
            Default: dict(type='LN').
        init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization.
            Default: None.
        batch_first (bool): Key, Query and Value are shape
            of (batch, n, embed_dim)
            or (n, batch, embed_dim). Default to False.
    """

    def __init__(self,
                 attn_cfgs=None,
                 ffn_cfgs=dict(
                     type='FFN',
                     embed_dims=256,
                     feedforward_channels=1024,
                     num_fcs=2,
                     ffn_drop=0.,
                     act_cfg=dict(type='ReLU', inplace=True),
                 ),
                 operation_order=None,
                 norm_cfg=dict(type='LN'),
                 init_cfg=None,
                 batch_first=False,
                 **kwargs):

        deprecated_args = dict(
            feedforward_channels='feedforward_channels',
            ffn_dropout='ffn_drop',
            ffn_num_fcs='num_fcs')
        for ori_name, new_name in deprecated_args.items():
            if ori_name in kwargs:
                warnings.warn(
                    f'The arguments `{ori_name}` in BaseTransformerLayer '
                    f'has been deprecated, now you should set `{new_name}` '
                    f'and other FFN related arguments '
                    f'to a dict named `ffn_cfgs`. ')
                ffn_cfgs[new_name] = kwargs[ori_name]

        super(BaseTransformerLayer, self).__init__(init_cfg)

        self.batch_first = batch_first

        assert set(operation_order) & set(
            ['self_attn', 'norm', 'ffn', 'cross_attn']) == \
            set(operation_order), f'The operation_order of' \
            f' {self.__class__.__name__} should ' \
            f'contains all four operation type ' \
            f"{['self_attn', 'norm', 'ffn', 'cross_attn']}"

        num_attn = operation_order.count('self_attn') + operation_order.count(
            'cross_attn')
        if isinstance(attn_cfgs, dict):
            attn_cfgs = [copy.deepcopy(attn_cfgs) for _ in range(num_attn)]
        else:
            assert num_attn == len(attn_cfgs), f'The length ' \
                f'of attn_cfg {num_attn} is ' \
                f'not consistent with the number of attention' \
                f'in operation_order {operation_order}.'

        self.num_attn = num_attn
        self.operation_order = operation_order
        self.norm_cfg = norm_cfg
        self.pre_norm = operation_order[0] == 'norm'
        self.attentions = ModuleList()

        index = 0
        for operation_name in operation_order:
            if operation_name in ['self_attn', 'cross_attn']:
                if 'batch_first' in attn_cfgs[index]:
                    assert self.batch_first == attn_cfgs[index]['batch_first']
                else:
                    attn_cfgs[index]['batch_first'] = self.batch_first
                attention = build_attention(attn_cfgs[index])
                # Some custom attentions used as `self_attn`
                # or `cross_attn` can have different behavior.
                attention.operation_name = operation_name
                self.attentions.append(attention)
                index += 1

        self.embed_dims = self.attentions[0].embed_dims

        self.ffns = ModuleList()
        num_ffns = operation_order.count('ffn')
        if isinstance(ffn_cfgs, dict):
            ffn_cfgs = ConfigDict(ffn_cfgs)
        if isinstance(ffn_cfgs, dict):
            ffn_cfgs = [copy.deepcopy(ffn_cfgs) for _ in range(num_ffns)]
        assert len(ffn_cfgs) == num_ffns
        for ffn_index in range(num_ffns):
            if 'embed_dims' not in ffn_cfgs[ffn_index]:
                ffn_cfgs['embed_dims'] = self.embed_dims
            else:
                assert ffn_cfgs[ffn_index]['embed_dims'] == self.embed_dims
            self.ffns.append(
                build_feedforward_network(ffn_cfgs[ffn_index],
                                          dict(type='FFN')))

        self.norms = ModuleList()
        num_norms = operation_order.count('norm')
        for _ in range(num_norms):
            self.norms.append(build_norm_layer(norm_cfg, self.embed_dims)[1])

    def forward(self,
                query,
                key=None,
                value=None,
                query_pos=None,
                key_pos=None,
                attn_masks=None,
                query_key_padding_mask=None,
                key_padding_mask=None,
                **kwargs):
        """Forward function for `TransformerDecoderLayer`.

        **kwargs contains some specific arguments of attentions.

        Args:
            query (Tensor): The input query with shape
                [num_queries, bs, embed_dims] if
                self.batch_first is False, else
                [bs, num_queries embed_dims].
            key (Tensor): The key tensor with shape [num_keys, bs,
                embed_dims] if self.batch_first is False, else
                [bs, num_keys, embed_dims] .
            value (Tensor): The value tensor with same shape as `key`.
            query_pos (Tensor): The positional encoding for `query`.
                Default: None.
            key_pos (Tensor): The positional encoding for `key`.
                Default: None.
            attn_masks (List[Tensor] | None): 2D Tensor used in
                calculation of corresponding attention. The length of
                it should equal to the number of `attention` in
                `operation_order`. Default: None.
            query_key_padding_mask (Tensor): ByteTensor for `query`, with
                shape [bs, num_queries]. Only used in `self_attn` layer.
                Defaults to None.
            key_padding_mask (Tensor): ByteTensor for `query`, with
                shape [bs, num_keys]. Default: None.

        Returns:
            Tensor: forwarded results with shape [num_queries, bs, embed_dims].
        """

        norm_index = 0
        attn_index = 0
        ffn_index = 0
        identity = query
        if attn_masks is None:
            attn_masks = [None for _ in range(self.num_attn)]
        elif isinstance(attn_masks, torch.Tensor):
            attn_masks = [
                copy.deepcopy(attn_masks) for _ in range(self.num_attn)
            ]
            warnings.warn(f'Use same attn_mask in all attentions in '
                          f'{self.__class__.__name__} ')
        else:
            assert len(attn_masks) == self.num_attn, f'The length of ' \
                f'attn_masks {len(attn_masks)} must be equal ' \
                f'to the number of attention in ' \
                f'operation_order {self.num_attn}'

        for layer in self.operation_order:
            if layer == 'self_attn':
                temp_key = temp_value = query
                query = self.attentions[attn_index](
                    query,
                    temp_key,
                    temp_value,
                    identity if self.pre_norm else None,
                    query_pos=query_pos,
                    key_pos=query_pos,
                    attn_mask=attn_masks[attn_index],
                    key_padding_mask=query_key_padding_mask,
                    **kwargs)
                attn_index += 1
                identity = query

            elif layer == 'norm':
                query = self.norms[norm_index](query)
                norm_index += 1

            elif layer == 'cross_attn':
                query = self.attentions[attn_index](
                    query,
                    key,
                    value,
                    identity if self.pre_norm else None,
                    query_pos=query_pos,
                    key_pos=key_pos,
                    attn_mask=attn_masks[attn_index],
                    key_padding_mask=key_padding_mask,
                    **kwargs)
                attn_index += 1
                identity = query

            elif layer == 'ffn':
                query = self.ffns[ffn_index](
                    query, identity if self.pre_norm else None)
                ffn_index += 1

        return query
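Config-driven sketch (editor's illustration): a DETR-style decoder layer assembled from the registries, where operation_order decides which registered attention, FFN and norm modules are built and in what order.

from annotator.uniformer.mmcv.cnn.bricks.transformer import build_transformer_layer

layer = build_transformer_layer(dict(
    type='BaseTransformerLayer',
    attn_cfgs=dict(type='MultiheadAttention', embed_dims=256, num_heads=8),
    ffn_cfgs=dict(type='FFN', embed_dims=256, feedforward_channels=2048),
    operation_order=('self_attn', 'norm', 'cross_attn', 'norm', 'ffn', 'norm')))
print(layer.num_attn, layer.pre_norm)    # 2 False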
@TRANSFORMER_LAYER_SEQUENCE.register_module()
class TransformerLayerSequence(BaseModule):
    """Base class for TransformerEncoder and TransformerDecoder in vision
    transformer.

    As base-class of Encoder and Decoder in vision transformer.
    Support customization such as specifying different kind
    of `transformer_layer` in `transformer_coder`.

    Args:
        transformerlayer (list[obj:`mmcv.ConfigDict`] |
            obj:`mmcv.ConfigDict`): Config of transformerlayer
            in TransformerCoder. If it is obj:`mmcv.ConfigDict`,
            it would be repeated `num_layer` times to a
            list[`mmcv.ConfigDict`]. Default: None.
        num_layers (int): The number of `TransformerLayer`. Default: None.
        init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization.
            Default: None.
    """

    def __init__(self, transformerlayers=None, num_layers=None, init_cfg=None):
        super(TransformerLayerSequence, self).__init__(init_cfg)
        if isinstance(transformerlayers, dict):
            transformerlayers = [
                copy.deepcopy(transformerlayers) for _ in range(num_layers)
            ]
        else:
            assert isinstance(transformerlayers, list) and \
                len(transformerlayers) == num_layers
        self.num_layers = num_layers
        self.layers = ModuleList()
        for i in range(num_layers):
            self.layers.append(build_transformer_layer(transformerlayers[i]))
        self.embed_dims = self.layers[0].embed_dims
        self.pre_norm = self.layers[0].pre_norm

    def forward(self,
                query,
                key,
                value,
                query_pos=None,
                key_pos=None,
                attn_masks=None,
                query_key_padding_mask=None,
                key_padding_mask=None,
                **kwargs):
        """Forward function for `TransformerCoder`.

        Args:
            query (Tensor): Input query with shape
                `(num_queries, bs, embed_dims)`.
            key (Tensor): The key tensor with shape
                `(num_keys, bs, embed_dims)`.
            value (Tensor): The value tensor with shape
                `(num_keys, bs, embed_dims)`.
            query_pos (Tensor): The positional encoding for `query`.
                Default: None.
            key_pos (Tensor): The positional encoding for `key`.
                Default: None.
            attn_masks (List[Tensor], optional): Each element is 2D Tensor
                which is used in calculation of corresponding attention in
                operation_order. Default: None.
            query_key_padding_mask (Tensor): ByteTensor for `query`, with
                shape [bs, num_queries]. Only used in self-attention
                Default: None.
            key_padding_mask (Tensor): ByteTensor for `query`, with
                shape [bs, num_keys]. Default: None.

        Returns:
            Tensor: results with shape [num_queries, bs, embed_dims].
        """
        for layer in self.layers:
            query = layer(
                query,
                key,
                value,
                query_pos=query_pos,
                key_pos=key_pos,
                attn_masks=attn_masks,
                query_key_padding_mask=query_key_padding_mask,
                key_padding_mask=key_padding_mask,
                **kwargs)
        return query
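And a sketch of stacking identical layers into an encoder via the sequence builder (editor's illustration):

from annotator.uniformer.mmcv.cnn.bricks.transformer import build_transformer_layer_sequence

encoder = build_transformer_layer_sequence(dict(
    type='TransformerLayerSequence',
    transformerlayers=dict(
        type='BaseTransformerLayer',
        attn_cfgs=dict(type='MultiheadAttention', embed_dims=256, num_heads=8),
        operation_order=('self_attn', 'norm', 'ffn', 'norm')),
    num_layers=6))
print(encoder.num_layers, encoder.embed_dims)    # 6 256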
84
comfy/annotator/uniformer/mmcv/cnn/bricks/upsample.py
Normal file
@ -0,0 +1,84 @@
# Copyright (c) OpenMMLab. All rights reserved.
import torch.nn as nn
import torch.nn.functional as F

from ..utils import xavier_init
from .registry import UPSAMPLE_LAYERS

UPSAMPLE_LAYERS.register_module('nearest', module=nn.Upsample)
UPSAMPLE_LAYERS.register_module('bilinear', module=nn.Upsample)


@UPSAMPLE_LAYERS.register_module(name='pixel_shuffle')
class PixelShufflePack(nn.Module):
    """Pixel Shuffle upsample layer.

    This module packs `F.pixel_shuffle()` and a nn.Conv2d module together to
    achieve a simple upsampling with pixel shuffle.

    Args:
        in_channels (int): Number of input channels.
        out_channels (int): Number of output channels.
        scale_factor (int): Upsample ratio.
        upsample_kernel (int): Kernel size of the conv layer to expand the
            channels.
    """

    def __init__(self, in_channels, out_channels, scale_factor,
                 upsample_kernel):
        super(PixelShufflePack, self).__init__()
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.scale_factor = scale_factor
        self.upsample_kernel = upsample_kernel
        self.upsample_conv = nn.Conv2d(
            self.in_channels,
            self.out_channels * scale_factor * scale_factor,
            self.upsample_kernel,
            padding=(self.upsample_kernel - 1) // 2)
        self.init_weights()

    def init_weights(self):
        xavier_init(self.upsample_conv, distribution='uniform')

    def forward(self, x):
        x = self.upsample_conv(x)
        x = F.pixel_shuffle(x, self.scale_factor)
        return x


def build_upsample_layer(cfg, *args, **kwargs):
    """Build upsample layer.

    Args:
        cfg (dict): The upsample layer config, which should contain:

            - type (str): Layer type.
            - scale_factor (int): Upsample ratio, which is not applicable to
              deconv.
            - layer args: Args needed to instantiate a upsample layer.
        args (argument list): Arguments passed to the ``__init__``
            method of the corresponding conv layer.
        kwargs (keyword arguments): Keyword arguments passed to the
            ``__init__`` method of the corresponding conv layer.

    Returns:
        nn.Module: Created upsample layer.
    """
    if not isinstance(cfg, dict):
        raise TypeError(f'cfg must be a dict, but got {type(cfg)}')
    if 'type' not in cfg:
        raise KeyError(
            f'the cfg dict must contain the key "type", but got {cfg}')
    cfg_ = cfg.copy()

    layer_type = cfg_.pop('type')
    if layer_type not in UPSAMPLE_LAYERS:
        raise KeyError(f'Unrecognized upsample type {layer_type}')
    else:
        upsample = UPSAMPLE_LAYERS.get(layer_type)

    if upsample is nn.Upsample:
        cfg_['mode'] = layer_type
    layer = upsample(*args, **kwargs, **cfg_)
    return layer
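Usage sketch for build_upsample_layer (editor's illustration): the same builder covers interpolation ('nearest' and 'bilinear' map to nn.Upsample with the matching mode) and the PixelShufflePack defined above.

from annotator.uniformer.mmcv.cnn.bricks.upsample import build_upsample_layer

up1 = build_upsample_layer(dict(type='bilinear', scale_factor=2))
up2 = build_upsample_layer(dict(type='pixel_shuffle', in_channels=64,
                                out_channels=64, scale_factor=2,
                                upsample_kernel=3))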
Some files were not shown because too many files have changed in this diff.