YOLOv4: Optimal Speed and Accuracy of Object Detection.
1. The Network Structure of YOLOv4
YOLOv4 keeps the one-stage detector body of YOLOv3. Its main components are:
- backbone: CSPDarknet53 (Darknet53 in YOLOv3)
- neck: SPP + PANet, which extracts feature maps at three scales
- head: the same as in YOLOv3
With an input image size of $416×416$, the network is structured as follows:
(1) Backbone: CSPDarknet53
The backbone of YOLOv4 keeps the resblock_body module defined in Darknet53, which consists of one downsampling convolution followed by a stack of residual blocks (left of the figure below). On top of this it introduces CSPNet (Cross Stage Partial Network): the trunk still stacks the original residual blocks, while a second branch adds a large residual edge that bypasses the whole stack (right of the figure below). The modified backbone is called CSPDarknet53.
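The PyTorch snippets in this article are adapted from the yolov4-pytorch reference implementation. To keep them self-contained, assume this minimal set of imports (everything else used below is defined in the article):
import math
from collections import OrderedDict

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from PIL import Image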
The activation function used in the backbone is Mish:
\[\text{Mish}(x) = x\cdot\tanh(\ln(1+e^x)) = x\cdot\tanh(\text{softplus}(x))\]
# Mish activation function
class Mish(nn.Module):
def __init__(self):
super(Mish, self).__init__()
def forward(self, x):
return x * torch.tanh(F.softplus(x))
# Conv2d + BatchNormalization + Mish
class BasicConv(nn.Module):
def __init__(self, in_channels, out_channels, kernel_size, stride=1):
super(BasicConv, self).__init__()
self.conv = nn.Conv2d(in_channels, out_channels, kernel_size, stride, kernel_size//2, bias=False)
self.bn = nn.BatchNorm2d(out_channels)
self.activation = Mish()
def forward(self, x):
x = self.conv(x)
x = self.bn(x)
x = self.activation(x)
return x
# Residual block stacked inside CSPDarknet
class Resblock(nn.Module):
def __init__(self, channels, hidden_channels=None):
super(Resblock, self).__init__()
if hidden_channels is None:
hidden_channels = channels
self.block = nn.Sequential(
BasicConv(channels, hidden_channels, 1),
BasicConv(hidden_channels, channels, 3)
)
def forward(self, x):
return x + self.block(x)
# Structural block (stage) of CSPDarknet
class Resblock_body(nn.Module):
def __init__(self, in_channels, out_channels, num_blocks, first):
super(Resblock_body, self).__init__()
        # A stride-2 convolution compresses the height and width
self.downsample_conv = BasicConv(in_channels, out_channels, 3, stride=2)
if first:
            # Build the large residual edge self.split_conv0, which bypasses the stacked residual blocks
self.split_conv0 = BasicConv(out_channels, out_channels, 1)
            # The trunk loops over num_blocks residual blocks
self.split_conv1 = BasicConv(out_channels, out_channels, 1)
self.blocks_conv = nn.Sequential(
Resblock(channels=out_channels, hidden_channels=out_channels//2),
BasicConv(out_channels, out_channels, 1)
)
self.concat_conv = BasicConv(out_channels*2, out_channels, 1)
else:
self.split_conv0 = BasicConv(out_channels, out_channels//2, 1)
self.split_conv1 = BasicConv(out_channels, out_channels//2, 1)
self.blocks_conv = nn.Sequential(
*[Resblock(out_channels//2) for _ in range(num_blocks)],
BasicConv(out_channels//2, out_channels//2, 1)
)
self.concat_conv = BasicConv(out_channels, out_channels, 1)
def forward(self, x):
x = self.downsample_conv(x)
x0 = self.split_conv0(x)
x1 = self.split_conv1(x)
x1 = self.blocks_conv(x1)
        # Concatenate the large residual edge back onto the trunk
x = torch.cat([x1, x0], dim=1)
        # Fuse the channels with a final 1x1 convolution
x = self.concat_conv(x)
return x
# Body of CSPDarknet53: takes a 416x416x3 image and outputs three effective feature layers
class CSPDarkNet(nn.Module):
def __init__(self, layers=[1, 2, 8, 8, 4]):
super(CSPDarkNet, self).__init__()
self.inplanes = 32
# 416,416,3 -> 416,416,32
self.conv1 = BasicConv(3, self.inplanes, kernel_size=3, stride=1)
self.feature_channels = [64, 128, 256, 512, 1024]
self.stages = nn.ModuleList([
# 416,416,32 -> 208,208,64
Resblock_body(self.inplanes, self.feature_channels[0], layers[0], first=True),
# 208,208,64 -> 104,104,128
Resblock_body(self.feature_channels[0], self.feature_channels[1], layers[1], first=False),
# 104,104,128 -> 52,52,256
Resblock_body(self.feature_channels[1], self.feature_channels[2], layers[2], first=False),
# 52,52,256 -> 26,26,512
Resblock_body(self.feature_channels[2], self.feature_channels[3], layers[3], first=False),
# 26,26,512 -> 13,13,1024
Resblock_body(self.feature_channels[3], self.feature_channels[4], layers[4], first=False)
])
self.num_features = 1
for m in self.modules():
if isinstance(m, nn.Conv2d):
n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
m.weight.data.normal_(0, math.sqrt(2. / n))
elif isinstance(m, nn.BatchNorm2d):
m.weight.data.fill_(1)
m.bias.data.zero_()
def forward(self, x):
x = self.conv1(x)
x = self.stages[0](x)
x = self.stages[1](x)
out3 = self.stages[2](x)
out4 = self.stages[3](out3)
out5 = self.stages[4](out4)
return out3, out4, out5
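The YoloBody class below builds its backbone through a darknet53(pretrained) helper that is not shown above. A minimal sketch consistent with the CSPDarkNet class (the pretrained-weight loading of the reference repo is omitted here):
def darknet53(pretrained=False):
    model = CSPDarkNet([1, 2, 8, 8, 4])
    if pretrained:
        # The reference implementation loads a pretrained CSPDarknet53
        # checkpoint at this point; the path handling is omitted in this sketch.
        raise NotImplementedError("load pretrained CSPDarknet53 weights here")
    return model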
(2) Neck: SPP + PANet
The SPP block max-pools the $13×13$ output feature map of the last resblock_body of CSPDarknet53 with four kernel sizes ($13×13$, $9×9$, $5×5$, $1×1$); the $1×1$ branch is the identity, which is how the code below implements it. SPP is preceded and followed by three DarknetConv2D_BN_Leaky convolutions (in the neck, convolutions use LeakyReLU rather than Mish, as the conv2d helper shows).
def conv2d(filter_in, filter_out, kernel_size, stride=1):
pad = (kernel_size - 1) // 2 if kernel_size else 0
return nn.Sequential(OrderedDict([
("conv", nn.Conv2d(filter_in, filter_out, kernel_size=kernel_size, stride=stride, padding=pad, bias=False)),
("bn", nn.BatchNorm2d(filter_out)),
("relu", nn.LeakyReLU(0.1)),
]))
# Three-convolution block
def make_three_conv(filters_list, in_filters):
m = nn.Sequential(
conv2d(in_filters, filters_list[0], 1),
conv2d(filters_list[0], filters_list[1], 3),
conv2d(filters_list[1], filters_list[0], 1),
)
return m
# SPP: pool with kernels of different sizes, then concatenate the results
class SpatialPyramidPooling(nn.Module):
def __init__(self, pool_sizes=[5, 9, 13]):
super(SpatialPyramidPooling, self).__init__()
self.maxpools = nn.ModuleList([nn.MaxPool2d(pool_size, 1, pool_size//2) for pool_size in pool_sizes])
def forward(self, x):
features = [maxpool(x) for maxpool in self.maxpools[::-1]]
features = torch.cat(features + [x], dim=1)
return features
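A quick shape check (tensor sizes are illustrative, matching the $416×416$ input): every pooled branch keeps the $13×13$ resolution, so concatenating the three branches with the input quadruples the channel count.
# Sanity check: SPP keeps the spatial size and multiplies the channels by 4
x = torch.randn(1, 512, 13, 13)   # output of the first three-conv block
spp = SpatialPyramidPooling()
print(spp(x).shape)               # torch.Size([1, 2048, 13, 13])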
PANet (Path Aggregation Network) is a structure that repeatedly extracts and fuses features. FPN showed that adding a top-down path enriches the semantics of the higher layers; since low-level features help with localization, PANet adds a bottom-up path on top of FPN's P2-P5 levels to propagate low-level features upward.
YOLOv4 further modifies PANet's feature connections, replacing the residual addition with concatenation:
(3) Head
YOLOv4 finally produces three output feature maps of sizes $n×13×13×75$, $n×26×26×75$ and $n×52×52×75$. The $13×13$, $26×26$ and $52×52$ grids correspond to regions of the original image and detect large, medium and small objects respectively.
The last dimension, $75$, decomposes as $3×(1+4+20)$:
- $3$: each position of each feature map is assigned $3$ anchor (prior) boxes;
- $1$: whether the position contains an object or background;
- $4$: the center coordinates and the height and width of the bbox;
- $20$: the $20$ classes of the VOC dataset (conditional probabilities).
The anchor box sizes are obtained by k-means clustering over the ground-truth boxes; a minimal sketch of this clustering follows.
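The sketch uses the customary $1-\text{IoU}$ distance on (width, height) pairs; the function name and the median update are illustrative, not taken from the reference repo.
# Hypothetical k-means over box sizes with IoU-based assignment
def kmeans_anchors(wh, k=9, iters=100):
    # wh: (N, 2) array of ground-truth box widths and heights
    anchors = wh[np.random.choice(len(wh), k, replace=False)].astype(np.float64)
    for _ in range(iters):
        # IoU between every box and every anchor, both anchored at the origin
        inter = np.minimum(wh[:, None, 0], anchors[None, :, 0]) * \
                np.minimum(wh[:, None, 1], anchors[None, :, 1])
        union = wh[:, 0:1] * wh[:, 1:2] + anchors[:, 0] * anchors[:, 1] - inter
        assign = np.argmax(inter / union, axis=1)   # closest anchor = highest IoU
        for j in range(k):
            if np.any(assign == j):
                anchors[j] = np.median(wh[assign == j], axis=0)
    return anchors[np.argsort(anchors.prod(axis=1))]  # sorted by area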
For a complete PyTorch implementation of YOLOv4, see yolov4-pytorch.
# Convolution + upsampling
class Upsample(nn.Module):
def __init__(self, in_channels, out_channels):
super(Upsample, self).__init__()
self.upsample = nn.Sequential(
conv2d(in_channels, out_channels, 1),
nn.Upsample(scale_factor=2, mode='nearest')
)
    def forward(self, x):
x = self.upsample(x)
return x
# Five-convolution block
def make_five_conv(filters_list, in_filters):
m = nn.Sequential(
conv2d(in_filters, filters_list[0], 1),
conv2d(filters_list[0], filters_list[1], 3),
conv2d(filters_list[1], filters_list[0], 1),
conv2d(filters_list[0], filters_list[1], 3),
conv2d(filters_list[1], filters_list[0], 1),
)
return m
# Final head producing the YOLOv4 outputs
def yolo_head(filters_list, in_filters):
m = nn.Sequential(
conv2d(in_filters, filters_list[0], 3),
nn.Conv2d(filters_list[0], filters_list[1], 1),
)
return m
class YoloBody(nn.Module):
def __init__(self, anchors_mask=[[6, 7, 8], [3, 4, 5], [0, 1, 2]], num_classes=20, pretrained = False):
super(YoloBody, self).__init__()
        # Build the CSPDarknet53 backbone
self.backbone = darknet53(pretrained)
self.conv1 = make_three_conv([512,1024],1024)
self.SPP = SpatialPyramidPooling()
self.conv2 = make_three_conv([512,1024],2048)
self.upsample1 = Upsample(512,256)
self.conv_for_P4 = conv2d(512,256,1)
self.make_five_conv1 = make_five_conv([256, 512],512)
self.upsample2 = Upsample(256,128)
self.conv_for_P3 = conv2d(256,128,1)
self.make_five_conv2 = make_five_conv([128, 256],256)
# 3*(5+num_classes) = 3*(5+20) = 3*(4+1+20)=75
self.yolo_head3 = yolo_head([256, len(anchors_mask[0]) * (5 + num_classes)],128)
self.down_sample1 = conv2d(128,256,3,stride=2)
self.make_five_conv3 = make_five_conv([256, 512],512)
# 3*(5+num_classes) = 3*(5+20) = 3*(4+1+20)=75
self.yolo_head2 = yolo_head([512, len(anchors_mask[1]) * (5 + num_classes)],256)
self.down_sample2 = conv2d(256,512,3,stride=2)
self.make_five_conv4 = make_five_conv([512, 1024],1024)
# 3*(5+num_classes)=3*(5+20)=3*(4+1+20)=75
self.yolo_head1 = yolo_head([1024, len(anchors_mask[2]) * (5 + num_classes)],512)
def forward(self, x):
# backbone
x2, x1, x0 = self.backbone(x)
# 13,13,1024 -> 13,13,512 -> 13,13,1024 -> 13,13,512 -> 13,13,2048
P5 = self.conv1(x0)
P5 = self.SPP(P5)
# 13,13,2048 -> 13,13,512 -> 13,13,1024 -> 13,13,512
P5 = self.conv2(P5)
# 13,13,512 -> 13,13,256 -> 26,26,256
P5_upsample = self.upsample1(P5)
# 26,26,512 -> 26,26,256
P4 = self.conv_for_P4(x1)
# 26,26,256 + 26,26,256 -> 26,26,512
        P4 = torch.cat([P4, P5_upsample], dim=1)
# 26,26,512 -> 26,26,256 -> 26,26,512 -> 26,26,256 -> 26,26,512 -> 26,26,256
P4 = self.make_five_conv1(P4)
# 26,26,256 -> 26,26,128 -> 52,52,128
P4_upsample = self.upsample2(P4)
# 52,52,256 -> 52,52,128
P3 = self.conv_for_P3(x2)
# 52,52,128 + 52,52,128 -> 52,52,256
        P3 = torch.cat([P3, P4_upsample], dim=1)
# 52,52,256 -> 52,52,128 -> 52,52,256 -> 52,52,128 -> 52,52,256 -> 52,52,128
P3 = self.make_five_conv2(P3)
# 52,52,128 -> 26,26,256
P3_downsample = self.down_sample1(P3)
# 26,26,256 + 26,26,256 -> 26,26,512
        P4 = torch.cat([P3_downsample, P4], dim=1)
# 26,26,512 -> 26,26,256 -> 26,26,512 -> 26,26,256 -> 26,26,512 -> 26,26,256
P4 = self.make_five_conv3(P4)
# 26,26,256 -> 13,13,512
P4_downsample = self.down_sample2(P4)
# 13,13,512 + 13,13,512 -> 13,13,1024
        P5 = torch.cat([P4_downsample, P5], dim=1)
# 13,13,1024 -> 13,13,512 -> 13,13,1024 -> 13,13,512 -> 13,13,1024 -> 13,13,512
P5 = self.make_five_conv4(P5)
        # Third feature layer: y3 = (batch_size, 75, 52, 52)
out2 = self.yolo_head3(P3)
        # Second feature layer: y2 = (batch_size, 75, 26, 26)
out1 = self.yolo_head2(P4)
        # First feature layer: y1 = (batch_size, 75, 13, 13)
out0 = self.yolo_head1(P5)
return out0, out1, out2
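A quick smoke test of the assembled network (batch size $1$, shapes as annotated in the forward pass above):
model = YoloBody(num_classes=20)
imgs = torch.randn(1, 3, 416, 416)
out0, out1, out2 = model(imgs)
print(out0.shape, out1.shape, out2.shape)
# torch.Size([1, 75, 13, 13]) torch.Size([1, 75, 26, 26]) torch.Size([1, 75, 52, 52])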
2. The Loss Function of YOLOv4
The three feature maps output by the YOLOv4 network have shapes (in PyTorch layout) $n×75×13×13$, $n×75×26×26$ and $n×75×52×52$.
Each is reshaped to $(n×3×h×w×25)$: the image is divided into $h×w$ cells, each cell is assigned $3$ anchors, and each anchor carries $25$ parameters, $y_{pred}=(x_{pred},y_{pred},h_{pred},w_{pred},p_{pred},c_{pred})$, where $c_{pred}$ holds the $20$ class probabilities.
Computing the loss boils down to comparing y_pred with y_true:
- y_pred is the network's output for an image, spread over the three feature layers; it must be decoded before bounding boxes can be drawn on the image;
- y_true is the ground-truth label of the image, containing for each real box its offset within a grid cell, its width/height, and its class; it must likewise be encoded to match the structure of y_pred.
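Decoding follows the YOLOv3 convention (and matches get_ignore further below): with the grid-cell offset $(c_x, c_y)$ and the anchor size $(p_w, p_h)$, the raw outputs $(t_x, t_y, t_w, t_h)$ are decoded as
\[\begin{aligned} b_x &= \sigma(t_x) + c_x \\ b_y &= \sigma(t_y) + c_y \\ b_w &= p_w e^{t_w} \\ b_h &= p_h e^{t_h} \end{aligned}\]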
For each output feature layer, the loss is computed as follows:
- Use y_true to extract the positions on this layer that actually contain an object $(n×3×h×w×1)$ and their classes $(n×3×h×w×20)$;
- Process the raw output into a prediction tensor of shape $(n×3×h×w×25)$, along with the decoded $xywh$;
- For each image, compute the IoU between all ground-truth boxes and all predicted boxes; predicted boxes whose best IoU exceeds the ignore threshold ($0.5$) are excluded from the no-object confidence loss, since they match a ground truth too well to serve as negatives;
- Compute the CIoU as the regression loss; only positive samples contribute to it;
- Compute the confidence loss, which has two parts: where an object actually exists, the predicted confidence is compared against $1$; where none exists, it is compared against $0$, with the predictions ignored in the previous step masked out;
- Compute the classification loss, i.e. the gap between the predicted and true classes at positions that actually contain an object.
The loss function of YOLOv4 is:
\[\begin{aligned} \mathcal{L}_{\mathrm{loc}} & =\lambda_{\text {coord }} \sum_{i=0}^{S^2} \sum_{j=0}^B \mathbb{1}_{i j}^{\mathrm{obj}}\left[\text{CIoU loss}\left((x_i,y_i,w_i,h_i),(\hat{x}_i,\hat{y}_i,\hat{w}_i,\hat{h}_i)\right)\right] \\ \mathcal{L}_{\mathrm{cls}} & =\sum_{i=0}^{S^2} \sum_{j=0}^B\left(\mathbb{1}_{i j}^{\mathrm{obj}}+\lambda_{\text {noobj }}\left(1-\mathbb{1}_{i j}^{\mathrm{obj}}\right)\right)\left(-C_{i j}\log \hat{C}_{i j}-(1-C_{i j})\log (1-\hat{C}_{i j})\right) \\ &+\sum_{i=0}^{S^2} \sum_{c \in \mathcal{C}} \mathbb{1}_i^{\mathrm{obj}}\left(-p_i(c)\log\hat{p}_i(c)-(1-p_i(c))\log(1-\hat{p}_i(c))\right) \\ \mathcal{L} & =\mathcal{L}_{\mathrm{loc}}+\mathcal{L}_{\mathrm{cls}} \end{aligned}\]where the bounding-box regression loss is the CIoU loss, which extends IoU with a normalized penalty on the center distance and an aspect-ratio consistency term $\alpha v$:
\[\text{CIoU} = \text{IoU} - \frac{\rho^2(b_{pred},b_{gt})}{c^2} - \alpha v\]Here $\rho$ is the distance between the two box centers and $c$ is the diagonal length of the smallest box enclosing both; $v$ measures the consistency of the two aspect ratios, and $\alpha$ balances the weight of $v$:
\[\begin{aligned} v &= \frac{4}{\pi^2}\left(\arctan\frac{w^{gt}}{h^{gt}}-\arctan\frac{w}{h}\right)^2 \\ \alpha &= \frac{v}{(1-\text{IoU})+v} \end{aligned}\]The box_iou helper below (like the other helpers that follow, a method of the YOLOLoss class defined later) implements this CIoU computation:
    def box_iou(self, b1, b2):
"""
输入为:
b1: tensor, shape=(batch, anchor_num, feat_w, feat_h, 4), xywh
b2: tensor, shape=(batch, anchor_num, feat_w, feat_h, 4), xywh
返回为:
out: tensor, shape=(batch, anchor_num, feat_w, feat_h)
"""
        # Top-left and bottom-right corners of the predicted boxes
b1_xy = b1[..., :2]
b1_wh = b1[..., 2:4]
b1_wh_half = b1_wh / 2.
b1_mins = b1_xy - b1_wh_half
b1_maxes = b1_xy + b1_wh_half
        # Top-left and bottom-right corners of the ground-truth boxes
b2_xy = b2[..., :2]
b2_wh = b2[..., 2:4]
b2_wh_half = b2_wh / 2.
b2_mins = b2_xy - b2_wh_half
b2_maxes = b2_xy + b2_wh_half
        # IoU between the ground-truth and predicted boxes
intersect_mins = torch.max(b1_mins, b2_mins)
intersect_maxes = torch.min(b1_maxes, b2_maxes)
intersect_wh = torch.max(intersect_maxes - intersect_mins, torch.zeros_like(intersect_maxes))
intersect_area = intersect_wh[..., 0] * intersect_wh[..., 1]
b1_area = b1_wh[..., 0] * b1_wh[..., 1]
b2_area = b2_wh[..., 0] * b2_wh[..., 1]
union_area = b1_area + b2_area - intersect_area
iou = intersect_area / torch.clamp(union_area,min = 1e-6)
        # Offset between the two box centers
center_wh = b1_xy - b2_xy
        # Corners of the smallest box enclosing both boxes
enclose_mins = torch.min(b1_mins, b2_mins)
enclose_maxes = torch.max(b1_maxes, b2_maxes)
enclose_wh = torch.max(enclose_maxes - enclose_mins, torch.zeros_like(intersect_maxes))
        # Squared distance between the centers
        center_distance = torch.sum(torch.pow(center_wh, 2), dim=-1)
        # Squared diagonal length of the enclosing box
        enclose_diagonal = torch.sum(torch.pow(enclose_wh, 2), dim=-1)
ciou = iou - 1.0 * (center_distance) / torch.clamp(enclose_diagonal, min = 1e-6)
v = (4 / (math.pi ** 2)) * torch.pow((torch.atan(b1_wh[..., 0] / torch.clamp(b1_wh[..., 1],min = 1e-6)) - torch.atan(b2_wh[..., 0] / torch.clamp(b2_wh[..., 1], min = 1e-6))), 2)
alpha = v / torch.clamp((1.0 - iou + v), min = 1e-6)
out = ciou - alpha * v
return out
Both the objectness (confidence) loss and the classification loss use binary cross-entropy.
def clip_by_tensor(self, t, t_min, t_max):
t = t.float()
result = (t >= t_min).float() * t + (t < t_min).float() * t_min
result = (result <= t_max).float() * result + (result > t_max).float() * t_max
return result
def BCELoss(self, pred, target):
epsilon = 1e-7
pred = self.clip_by_tensor(pred, epsilon, 1.0 - epsilon)
output = - target * torch.log(pred) - (1.0 - target) * torch.log(1.0 - pred)
return output
YOLOv4 adopts label smoothing. For a binary task, original labels of $0$ and $1$ become, say, $0.005$ and $0.995$ after smoothing (a smoothing factor of $0.01$ spread over two classes): the model is mildly penalized for classifying too confidently, since overconfidence encourages overfitting.
def smooth_labels(self, y_true, label_smoothing, num_classes):
return y_true * (1.0 - label_smoothing) + label_smoothing / num_classes
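For instance, with the $20$ VOC classes and label_smoothing = 0.01, a one-hot class target is softened as follows:
# What smooth_labels computes for a one-hot target over 20 classes
y_true = torch.zeros(20)
y_true[3] = 1.0
y_smooth = y_true * (1.0 - 0.01) + 0.01 / 20
# y_smooth[3] == 0.9905, every other entry == 0.0005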
The total loss of YOLOv4 is implemented as:
class YOLOLoss(nn.Module):
def __init__(self, anchors, num_classes, input_shape, anchors_mask = [[6,7,8], [3,4,5], [0,1,2]], label_smoothing = 0, alpha = 0.25, gamma = 2):
super(YOLOLoss, self).__init__()
        # The 13x13 feature layer uses anchors [142, 110], [192, 243], [459, 401]
        # The 26x26 feature layer uses anchors [36, 75], [76, 55], [72, 146]
        # The 52x52 feature layer uses anchors [12, 16], [19, 36], [40, 28]
self.anchors = anchors
self.num_classes = num_classes
self.bbox_attrs = 5 + num_classes
self.input_shape = input_shape
self.anchors_mask = anchors_mask
self.label_smoothing = label_smoothing
self.balance = [0.4, 1.0, 4]
self.box_ratio = 0.05
self.obj_ratio = 5 * (input_shape[0] * input_shape[1]) / (416 ** 2)
self.cls_ratio = 1 * (num_classes / 80)
self.alpha = alpha
self.gamma = gamma
self.ignore_threshold = 0.5
def forward(self, l, input, targets=None):
        # l: index of the effective feature layer being processed
        # input shape: bs, 3*(5+num_classes), 13, 13
        #              bs, 3*(5+num_classes), 26, 26
        #              bs, 3*(5+num_classes), 52, 52
        # targets: ground-truth boxes, [batch_size, num_gt, 5]
        # Number of images and the height/width of the feature layer
bs = input.size(0)
in_h = input.size(2)
in_w = input.size(3)
        # How many pixels of the original image each feature point covers (32, 16 or 8)
stride_h = self.input_shape[0] / in_h
stride_w = self.input_shape[1] / in_w
        # scaled_anchors are now expressed in feature-layer units
scaled_anchors = [(a_w / stride_w, a_h / stride_h) for a_w, a_h in self.anchors]
        # Reshape the prediction:
        # bs, 3*(5+num_classes), h, w -> bs, 3, 5+num_classes, h, w -> bs, 3, h, w, 5+num_classes
prediction = input.view(bs, len(self.anchors_mask[l]), self.bbox_attrs, in_h, in_w).permute(0, 1, 3, 4, 2).contiguous()
        # Adjustment parameters for the anchor centers
x = torch.sigmoid(prediction[..., 0])
y = torch.sigmoid(prediction[..., 1])
        # Adjustment parameters for the anchor widths and heights
w = prediction[..., 2]
h = prediction[..., 3]
        # Objectness confidence: is there an object here
conf = torch.sigmoid(prediction[..., 4])
        # Class probabilities
pred_cls = torch.sigmoid(prediction[..., 5:])
        # Build the targets the network should predict
y_true, noobj_mask = self.get_target(l, targets, scaled_anchors, in_h, in_w)
        # Decode the predictions and measure their overlap with the ground truth
noobj_mask, pred_boxes = self.get_ignore(l, x, y, h, w, targets, scaled_anchors, in_h, in_w, noobj_mask)
loss = 0
obj_mask = y_true[..., 4] == 1
n = torch.sum(obj_mask)
if n != 0:
            # Gap between the predictions and the ground truth (CIoU)
iou = self.box_iou(pred_boxes, y_true[..., :4]).type_as(x)
obj_mask = obj_mask & torch.logical_not(torch.isnan(iou))
loss_loc = torch.mean((1 - iou)[obj_mask])
loss_cls = torch.mean(self.BCELoss(pred_cls[obj_mask], y_true[..., 5:][obj_mask]))
loss += loss_loc * self.box_ratio + loss_cls * self.cls_ratio
        # Confidence loss for object presence
loss_conf = torch.mean(self.BCELoss(conf, obj_mask.type_as(conf))[noobj_mask.bool() | obj_mask])
loss += loss_conf * self.balance[l] * self.obj_ratio
return loss
def calculate_iou(self, _box_a, _box_b):
        # Top-left and bottom-right corners of the ground-truth boxes
b1_x1, b1_x2 = _box_a[:, 0] - _box_a[:, 2] / 2, _box_a[:, 0] + _box_a[:, 2] / 2
b1_y1, b1_y2 = _box_a[:, 1] - _box_a[:, 3] / 2, _box_a[:, 1] + _box_a[:, 3] / 2
        # Top-left and bottom-right corners of the predicted boxes obtained from the anchors
b2_x1, b2_x2 = _box_b[:, 0] - _box_b[:, 2] / 2, _box_b[:, 0] + _box_b[:, 2] / 2
b2_y1, b2_y2 = _box_b[:, 1] - _box_b[:, 3] / 2, _box_b[:, 1] + _box_b[:, 3] / 2
        # Convert both sets of boxes to (x1, y1, x2, y2) corner form
box_a = torch.zeros_like(_box_a)
box_b = torch.zeros_like(_box_b)
box_a[:, 0], box_a[:, 1], box_a[:, 2], box_a[:, 3] = b1_x1, b1_y1, b1_x2, b1_y2
box_b[:, 0], box_b[:, 1], box_b[:, 2], box_b[:, 3] = b2_x1, b2_y1, b2_x2, b2_y2
        # A is the number of ground-truth boxes, B the number of predicted boxes
A = box_a.size(0)
B = box_b.size(0)
        # Intersection area
max_xy = torch.min(box_a[:, 2:].unsqueeze(1).expand(A, B, 2), box_b[:, 2:].unsqueeze(0).expand(A, B, 2))
min_xy = torch.max(box_a[:, :2].unsqueeze(1).expand(A, B, 2), box_b[:, :2].unsqueeze(0).expand(A, B, 2))
inter = torch.clamp((max_xy - min_xy), min=0)
inter = inter[:, :, 0] * inter[:, :, 1]
        # Areas of the predicted and ground-truth boxes
area_a = ((box_a[:, 2]-box_a[:, 0]) * (box_a[:, 3]-box_a[:, 1])).unsqueeze(1).expand_as(inter) # [A,B]
area_b = ((box_b[:, 2]-box_b[:, 0]) * (box_b[:, 3]-box_b[:, 1])).unsqueeze(0).expand_as(inter) # [A,B]
        # IoU
union = area_a + area_b - inter
return inter / union # [A,B]
def get_target(self, l, targets, anchors, in_h, in_w):
        # Number of images in the batch
bs = len(targets)
        # Marks which anchors contain no object
noobj_mask = torch.ones(bs, len(self.anchors_mask[l]), in_h, in_w, requires_grad = False)
        # Training targets: batch_size, 3, in_h, in_w, 5 + num_classes
y_true = torch.zeros(bs, len(self.anchors_mask[l]), in_h, in_w, self.bbox_attrs, requires_grad = False)
for b in range(bs):
if len(targets[b])==0:
continue
batch_target = torch.zeros_like(targets[b])
            # Ground-truth centers and sizes in feature-layer units
batch_target[:, [0,2]] = targets[b][:, [0,2]] * in_w
batch_target[:, [1,3]] = targets[b][:, [1,3]] * in_h
batch_target[:, 4] = targets[b][:, 4]
batch_target = batch_target.cpu()
            # Ground-truth boxes as (0, 0, w, h): num_true_box, 4
gt_box = torch.FloatTensor(torch.cat((torch.zeros((batch_target.size(0), 2)), batch_target[:, 2:4]), 1))
            # Anchors as (0, 0, w, h): 9, 4
anchor_shapes = torch.FloatTensor(torch.cat((torch.zeros((len(anchors), 2)), torch.FloatTensor(anchors)), 1))
            # best_ns: for each ground-truth box, the index of the anchor with the highest IoU
best_ns = torch.argmax(self.calculate_iou(gt_box, anchor_shapes), dim=-1)
for t, best_n in enumerate(best_ns):
if best_n not in self.anchors_mask[l]:
continue
                # Which of this layer's anchors the best match corresponds to
k = self.anchors_mask[l].index(best_n)
                # Grid cell the ground-truth center falls into
i = torch.floor(batch_target[t, 0]).long()
j = torch.floor(batch_target[t, 1]).long()
                # Class of the ground-truth box
c = batch_target[t, 4].long()
                # noobj_mask marks feature points without objects
noobj_mask[b, k, j, i] = 0
                # y_true stores the ground-truth center and size (in feature-layer units), objectness and class
y_true[b, k, j, i, 0] = batch_target[t, 0]
y_true[b, k, j, i, 1] = batch_target[t, 1]
y_true[b, k, j, i, 2] = batch_target[t, 2]
y_true[b, k, j, i, 3] = batch_target[t, 3]
y_true[b, k, j, i, 4] = 1
y_true[b, k, j, i, c + 5] = 1
return y_true, noobj_mask
def get_ignore(self, l, x, y, h, w, targets, scaled_anchors, in_h, in_w, noobj_mask):
        # Number of images in the batch
bs = len(targets)
        # Build the grid: the anchor centers sit at the top-left corner of each cell
grid_x = torch.linspace(0, in_w - 1, in_w).repeat(in_h, 1).repeat(
int(bs * len(self.anchors_mask[l])), 1, 1).view(x.shape).type_as(x)
grid_y = torch.linspace(0, in_h - 1, in_h).repeat(in_w, 1).t().repeat(
int(bs * len(self.anchors_mask[l])), 1, 1).view(y.shape).type_as(x)
        # Widths and heights of this layer's anchors
scaled_anchors_l = np.array(scaled_anchors)[self.anchors_mask[l]]
anchor_w = torch.Tensor(scaled_anchors_l).index_select(1, torch.LongTensor([0])).type_as(x)
anchor_h = torch.Tensor(scaled_anchors_l).index_select(1, torch.LongTensor([1])).type_as(x)
anchor_w = anchor_w.repeat(bs, 1).repeat(1, 1, in_h * in_w).view(w.shape)
anchor_h = anchor_h.repeat(bs, 1).repeat(1, 1, in_h * in_w).view(h.shape)
        # Decode: adjusted box centers and sizes
pred_boxes_x = torch.unsqueeze(x + grid_x, -1)
pred_boxes_y = torch.unsqueeze(y + grid_y, -1)
pred_boxes_w = torch.unsqueeze(torch.exp(w) * anchor_w, -1)
pred_boxes_h = torch.unsqueeze(torch.exp(h) * anchor_h, -1)
pred_boxes = torch.cat([pred_boxes_x, pred_boxes_y, pred_boxes_w, pred_boxes_h], dim = -1)
for b in range(bs):
            # Flatten the predicted boxes: (3 * in_h * in_w, 4)
pred_boxes_for_ignore = pred_boxes[b].view(-1, 4)
            # Ground-truth boxes (num_true_box, 4), converted to feature-layer units
if len(targets[b]) > 0:
batch_target = torch.zeros_like(targets[b])
                # Ground-truth centers and sizes in feature-layer units
batch_target[:, [0,2]] = targets[b][:, [0,2]] * in_w
batch_target[:, [1,3]] = targets[b][:, [1,3]] * in_h
batch_target = batch_target[:, :4].type_as(x)
                # IoU between the ground-truth boxes and all predicted boxes
anch_ious = self.calculate_iou(batch_target, pred_boxes_for_ignore)
                # For each predicted box, the highest IoU with any ground-truth box
anch_ious_max, _ = torch.max(anch_ious, dim = 0)
anch_ious_max = anch_ious_max.view(pred_boxes[b].size()[:3])
noobj_mask[b][anch_ious_max > self.ignore_threshold] = 0
return noobj_mask, pred_boxes
During training, the losses over the three feature maps are accumulated:
yolo_loss = YOLOLoss(anchors, num_classes, input_shape, anchors_mask, label_smoothing)
optimizer.zero_grad()
outputs = model_train(images)
loss_value_all = 0
for l in range(len(outputs)):
loss_item = yolo_loss(l, outputs[l], targets)
loss_value_all += loss_item
loss_value = loss_value_all
loss_value.backward()
optimizer.step()
YOLOv4 trains with a cosine annealing schedule for the learning rate: the learning rate first rises linearly (warm-up) and then decays along a cosine curve, and the cycle can be repeated several times. PyTorch provides a ready-made scheduler for the cosine-decay part (the warm-up is handled separately):
lr_scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=5, eta_min=1e-5)
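A sketch of stepping the scheduler once per epoch (the model and the epoch count are placeholders); for the repeated cycles described above, optim.lr_scheduler.CosineAnnealingWarmRestarts is the ready-made alternative.
import torch.optim as optim

optimizer = optim.SGD(model.parameters(), lr=1e-2, momentum=0.9)
lr_scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=5, eta_min=1e-5)
for epoch in range(30):
    # ... train for one epoch ...
    lr_scheduler.step()  # cosine decay from 1e-2 towards eta_min over T_max epochs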
3. Other Improvements in YOLOv4
(1) Data augmentation: Mosaic
Mosaic fuses four training images into one. It enriches the backgrounds against which objects are detected and effectively raises the batch size (each sample now carries four images), so the BatchNorm statistics are computed across four images at once, making BN more stable:
- read four images at a time;
- flip, scale and color-jitter each of the four images, then place them at the top-left, bottom-left, bottom-right and top-right positions respectively;
- combine the images and merge their boxes.
def merge_bboxes(self, bboxes, cutx, cuty):
merge_bbox = []
for i in range(len(bboxes)):
for box in bboxes[i]:
tmp_box = []
x1, y1, x2, y2 = box[0], box[1], box[2], box[3]
if i == 0:
if y1 > cuty or x1 > cutx:
continue
if y2 >= cuty and y1 <= cuty:
y2 = cuty
if x2 >= cutx and x1 <= cutx:
x2 = cutx
if i == 1:
if y2 < cuty or x1 > cutx:
continue
if y2 >= cuty and y1 <= cuty:
y1 = cuty
if x2 >= cutx and x1 <= cutx:
x2 = cutx
if i == 2:
if y2 < cuty or x2 < cutx:
continue
if y2 >= cuty and y1 <= cuty:
y1 = cuty
if x2 >= cutx and x1 <= cutx:
x1 = cutx
if i == 3:
if y1 > cuty or x2 < cutx:
continue
if y2 >= cuty and y1 <= cuty:
y2 = cuty
if x2 >= cutx and x1 <= cutx:
x1 = cutx
tmp_box.append(x1)
tmp_box.append(y1)
tmp_box.append(x2)
tmp_box.append(y2)
tmp_box.append(box[-1])
merge_bbox.append(tmp_box)
return merge_bbox
def get_random_data_with_Mosaic(self, annotation_line, input_shape, jitter=0.3, hue=.1, sat=0.7, val=0.4):
h, w = input_shape
min_offset_x = self.rand(0.3, 0.7)
min_offset_y = self.rand(0.3, 0.7)
image_datas = []
box_datas = []
index = 0
for line in annotation_line:
            # Read the image
line_content = line.split()
image = Image.open(line_content[0])
image = cvtColor(image)
            # Image size
iw, ih = image.size
            # Parse the box coordinates
box = np.array([np.array(list(map(int,box.split(',')))) for box in line_content[1:]])
            # Randomly flip the image
flip = self.rand()<.5
if flip and len(box)>0:
image = image.transpose(Image.FLIP_LEFT_RIGHT)
box[:, [0,2]] = iw - box[:, [2,0]]
            # Rescale the image and jitter its aspect ratio
new_ar = iw/ih * self.rand(1-jitter,1+jitter) / self.rand(1-jitter,1+jitter)
scale = self.rand(.4, 1)
if new_ar < 1:
nh = int(scale*h)
nw = int(nh*new_ar)
else:
nw = int(scale*w)
nh = int(nw/new_ar)
image = image.resize((nw, nh), Image.BICUBIC)
            # Place the image in its quadrant (one of the four split positions)
if index == 0:
dx = int(w*min_offset_x) - nw
dy = int(h*min_offset_y) - nh
elif index == 1:
dx = int(w*min_offset_x) - nw
dy = int(h*min_offset_y)
elif index == 2:
dx = int(w*min_offset_x)
dy = int(h*min_offset_y)
elif index == 3:
dx = int(w*min_offset_x)
dy = int(h*min_offset_y) - nh
new_image = Image.new('RGB', (w,h), (128,128,128))
new_image.paste(image, (dx, dy))
image_data = np.array(new_image)
index = index + 1
box_data = []
            # Remap the boxes onto the new canvas
if len(box)>0:
np.random.shuffle(box)
box[:, [0,2]] = box[:, [0,2]]*nw/iw + dx
box[:, [1,3]] = box[:, [1,3]]*nh/ih + dy
box[:, 0:2][box[:, 0:2]<0] = 0
box[:, 2][box[:, 2]>w] = w
box[:, 3][box[:, 3]>h] = h
box_w = box[:, 2] - box[:, 0]
box_h = box[:, 3] - box[:, 1]
box = box[np.logical_and(box_w>1, box_h>1)]
box_data = np.zeros((len(box),5))
box_data[:len(box)] = box
image_datas.append(image_data)
box_datas.append(box_data)
        # Stitch the four quadrants together
cutx = int(w * min_offset_x)
cuty = int(h * min_offset_y)
new_image = np.zeros([h, w, 3])
new_image[:cuty, :cutx, :] = image_datas[0][:cuty, :cutx, :]
new_image[cuty:, :cutx, :] = image_datas[1][cuty:, :cutx, :]
new_image[cuty:, cutx:, :] = image_datas[2][cuty:, cutx:, :]
new_image[:cuty, cutx:, :] = image_datas[3][:cuty, cutx:, :]
new_image = np.array(new_image, np.uint8)
        # Merge the boxes across the cut lines
new_boxes = self.merge_bboxes(box_datas, cutx, cuty)
return new_image, new_boxes
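The self.rand helper used above is not shown in the excerpt; in the reference implementation it is a thin uniform sampler along these lines:
def rand(self, a=0, b=1):
    return np.random.rand() * (b - a) + a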
(2) Self-Adversarial Training (SAT)
- In the first stage, the network runs an adversarial attack on the original image, generating adversarial noise so that it can no longer detect the objects in the image;
- In the second stage, the network is trained in the normal way to detect the objects on this modified image.
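The paper provides no reference code for SAT; below is a minimal sketch of the two stages under an FGSM-style attack (the sat_step name, the epsilon value and the reuse of the detection loss for the attack are assumptions of this sketch):
# Hypothetical SAT training step: attack the image first, then train on it
def sat_step(model, yolo_loss, images, targets, optimizer, epsilon=0.01):
    # Stage 1: perturb the input so the network "loses" the objects
    images = images.clone().detach().requires_grad_(True)
    outputs = model(images)
    loss = sum(yolo_loss(l, outputs[l], targets) for l in range(len(outputs)))
    loss.backward()
    adv_images = (images + epsilon * images.grad.sign()).detach()  # FGSM-style noise
    # Stage 2: an ordinary training step on the modified image
    optimizer.zero_grad()
    outputs = model(adv_images)
    loss = sum(yolo_loss(l, outputs[l], targets) for l in range(len(outputs)))
    loss.backward()
    optimizer.step()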