YOLOv4: Optimal Speed and Accuracy of Object Detection.
1. The Network Structure of YOLOv4
YOLOv4 keeps the one-stage detector body of YOLOv3. Its main components are:
- backbone: CSPDarknet53 (Darknet53 in YOLOv3)
- neck: SPP + PANet, which extracts feature maps at three scales
- head: the same as in YOLOv3
With an input image size of $416×416$, the network is structured as follows:
(1) Backbone: CSPDarknet53
The backbone of YOLOv4 keeps the resblock_body module defined in Darknet53, which consists of one downsampling convolution followed by a stack of residual blocks (left of the figure below). On top of this it introduces CSPNet (Cross Stage Partial Network): the trunk still stacks the original residual blocks, while a second branch adds a large residual edge that bypasses the whole stack (right of the figure below). The modified backbone is called CSPDarknet53.
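The PyTorch snippets in this article are adapted from the yolov4-pytorch reference implementation. To keep them self-contained, assume this minimal set of imports (everything else used below is defined in the article):
import math
from collections import OrderedDict

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from PIL import Image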
The activation function used in the backbone is Mish:
\[\text{Mish}(x) = x\cdot\tanh(\ln(1+e^x)) = x\cdot\tanh(\text{softplus}(x))\]
# Mish activation function
class Mish(nn.Module):
def __init__(self):
super(Mish, self).__init__()
def forward(self, x):
return x * torch.tanh(F.softplus(x))
# Conv2d + BatchNormalization + Mish
class BasicConv(nn.Module):
def __init__(self, in_channels, out_channels, kernel_size, stride=1):
super(BasicConv, self).__init__()
self.conv = nn.Conv2d(in_channels, out_channels, kernel_size, stride, kernel_size//2, bias=False)
self.bn = nn.BatchNorm2d(out_channels)
self.activation = Mish()
def forward(self, x):
x = self.conv(x)
x = self.bn(x)
x = self.activation(x)
return x
# Residual block stacked inside CSPDarknet
class Resblock(nn.Module):
def __init__(self, channels, hidden_channels=None):
super(Resblock, self).__init__()
if hidden_channels is None:
hidden_channels = channels
self.block = nn.Sequential(
BasicConv(channels, hidden_channels, 1),
BasicConv(hidden_channels, channels, 3)
)
def forward(self, x):
return x + self.block(x)
# Structural block (stage) of CSPDarknet
class Resblock_body(nn.Module):
def __init__(self, in_channels, out_channels, num_blocks, first):
super(Resblock_body, self).__init__()
        # A stride-2 convolution compresses the height and width
self.downsample_conv = BasicConv(in_channels, out_channels, 3, stride=2)
if first:
            # Build the large residual edge self.split_conv0, which bypasses the stacked residual blocks
self.split_conv0 = BasicConv(out_channels, out_channels, 1)
            # The trunk loops over num_blocks residual blocks
self.split_conv1 = BasicConv(out_channels, out_channels, 1)
self.blocks_conv = nn.Sequential(
Resblock(channels=out_channels, hidden_channels=out_channels//2),
BasicConv(out_channels, out_channels, 1)
)
self.concat_conv = BasicConv(out_channels*2, out_channels, 1)
else:
self.split_conv0 = BasicConv(out_channels, out_channels//2, 1)
self.split_conv1 = BasicConv(out_channels, out_channels//2, 1)
self.blocks_conv = nn.Sequential(
*[Resblock(out_channels//2) for _ in range(num_blocks)],
BasicConv(out_channels//2, out_channels//2, 1)
)
self.concat_conv = BasicConv(out_channels, out_channels, 1)
def forward(self, x):
x = self.downsample_conv(x)
x0 = self.split_conv0(x)
x1 = self.split_conv1(x)
x1 = self.blocks_conv(x1)
        # Concatenate the large residual edge back onto the trunk
x = torch.cat([x1, x0], dim=1)
        # Fuse the channels with a final 1x1 convolution
x = self.concat_conv(x)
return x
# Body of CSPDarknet53: takes a 416x416x3 image and outputs three effective feature layers
class CSPDarkNet(nn.Module):
def __init__(self, layers=[1, 2, 8, 8, 4]):
super(CSPDarkNet, self).__init__()
self.inplanes = 32
# 416,416,3 -> 416,416,32
self.conv1 = BasicConv(3, self.inplanes, kernel_size=3, stride=1)
self.feature_channels = [64, 128, 256, 512, 1024]
self.stages = nn.ModuleList([
# 416,416,32 -> 208,208,64
Resblock_body(self.inplanes, self.feature_channels[0], layers[0], first=True),
# 208,208,64 -> 104,104,128
Resblock_body(self.feature_channels[0], self.feature_channels[1], layers[1], first=False),
# 104,104,128 -> 52,52,256
Resblock_body(self.feature_channels[1], self.feature_channels[2], layers[2], first=False),
# 52,52,256 -> 26,26,512
Resblock_body(self.feature_channels[2], self.feature_channels[3], layers[3], first=False),
# 26,26,512 -> 13,13,1024
Resblock_body(self.feature_channels[3], self.feature_channels[4], layers[4], first=False)
])
self.num_features = 1
for m in self.modules():
if isinstance(m, nn.Conv2d):
n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
m.weight.data.normal_(0, math.sqrt(2. / n))
elif isinstance(m, nn.BatchNorm2d):
m.weight.data.fill_(1)
m.bias.data.zero_()
def forward(self, x):
x = self.conv1(x)
x = self.stages[0](x)
x = self.stages[1](x)
out3 = self.stages[2](x)
out4 = self.stages[3](out3)
out5 = self.stages[4](out4)
return out3, out4, out5
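The YoloBody class below builds its backbone through a darknet53(pretrained) helper that is not shown above. A minimal sketch consistent with the CSPDarkNet class (the pretrained-weight loading of the reference repo is omitted here):
def darknet53(pretrained=False):
    model = CSPDarkNet([1, 2, 8, 8, 4])
    if pretrained:
        # The reference implementation loads a pretrained CSPDarknet53
        # checkpoint at this point; the path handling is omitted in this sketch.
        raise NotImplementedError("load pretrained CSPDarknet53 weights here")
    return model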
(2) Neck: SPP + PANet
The SPP block max-pools the $13×13$ output feature map of the last resblock_body of CSPDarknet53 with four kernel sizes ($13×13$, $9×9$, $5×5$, $1×1$); the $1×1$ branch is the identity, which is how the code below implements it. SPP is preceded and followed by three DarknetConv2D_BN_Leaky convolutions (in the neck, convolutions use LeakyReLU rather than Mish, as the conv2d helper shows).
def conv2d(filter_in, filter_out, kernel_size, stride=1):
pad = (kernel_size - 1) // 2 if kernel_size else 0
return nn.Sequential(OrderedDict([
("conv", nn.Conv2d(filter_in, filter_out, kernel_size=kernel_size, stride=stride, padding=pad, bias=False)),
("bn", nn.BatchNorm2d(filter_out)),
("relu", nn.LeakyReLU(0.1)),
]))
# Three-convolution block
def make_three_conv(filters_list, in_filters):
m = nn.Sequential(
conv2d(in_filters, filters_list[0], 1),
conv2d(filters_list[0], filters_list[1], 3),
conv2d(filters_list[1], filters_list[0], 1),
)
return m
# SPP: pool with kernels of different sizes, then concatenate the results
class SpatialPyramidPooling(nn.Module):
def __init__(self, pool_sizes=[5, 9, 13]):
super(SpatialPyramidPooling, self).__init__()
self.maxpools = nn.ModuleList([nn.MaxPool2d(pool_size, 1, pool_size//2) for pool_size in pool_sizes])
def forward(self, x):
features = [maxpool(x) for maxpool in self.maxpools[::-1]]
features = torch.cat(features + [x], dim=1)
return features
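A quick shape check (tensor sizes are illustrative, matching the $416×416$ input): every pooled branch keeps the $13×13$ resolution, so concatenating the three branches with the input quadruples the channel count.
# Sanity check: SPP keeps the spatial size and multiplies the channels by 4
x = torch.randn(1, 512, 13, 13)   # output of the first three-conv block
spp = SpatialPyramidPooling()
print(spp(x).shape)               # torch.Size([1, 2048, 13, 13])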
PANet (Path Aggregation Network) is a structure that repeatedly extracts and fuses features. FPN showed that adding a top-down path enriches the semantics of the higher layers; since low-level features help with localization, PANet adds a bottom-up path on top of FPN's P2-P5 levels to propagate low-level features upward.
YOLOv4 further modifies PANet's feature connections, replacing the residual addition with concatenation:
(3) Head
YOLOv4 finally produces three output feature maps of sizes $n×13×13×75$, $n×26×26×75$ and $n×52×52×75$. The $13×13$, $26×26$ and $52×52$ grids correspond to regions of the original image and detect large, medium and small objects respectively.
The last dimension, $75$, decomposes as $3×(1+4+20)$:
- $3$: each position of each feature map is assigned $3$ anchor (prior) boxes;
- $1$: whether the position contains an object or background;
- $4$: the center coordinates and the height and width of the bbox;
- $20$: the $20$ classes of the VOC dataset (conditional probabilities).
The anchor box sizes are obtained by k-means clustering over the ground-truth boxes; a minimal sketch of this clustering follows.
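The sketch uses the customary $1-\text{IoU}$ distance on (width, height) pairs; the function name and the median update are illustrative, not taken from the reference repo.
# Hypothetical k-means over box sizes with IoU-based assignment
def kmeans_anchors(wh, k=9, iters=100):
    # wh: (N, 2) array of ground-truth box widths and heights
    anchors = wh[np.random.choice(len(wh), k, replace=False)].astype(np.float64)
    for _ in range(iters):
        # IoU between every box and every anchor, both anchored at the origin
        inter = np.minimum(wh[:, None, 0], anchors[None, :, 0]) * \
                np.minimum(wh[:, None, 1], anchors[None, :, 1])
        union = wh[:, 0:1] * wh[:, 1:2] + anchors[:, 0] * anchors[:, 1] - inter
        assign = np.argmax(inter / union, axis=1)   # closest anchor = highest IoU
        for j in range(k):
            if np.any(assign == j):
                anchors[j] = np.median(wh[assign == j], axis=0)
    return anchors[np.argsort(anchors.prod(axis=1))]  # sorted by area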
For a complete PyTorch implementation of YOLOv4, see yolov4-pytorch.
# Convolution + upsampling
class Upsample(nn.Module):
def __init__(self, in_channels, out_channels):
super(Upsample, self).__init__()
self.upsample = nn.Sequential(
conv2d(in_channels, out_channels, 1),
nn.Upsample(scale_factor=2, mode='nearest')
)
    def forward(self, x):
x = self.upsample(x)
return x
# Five-convolution block
def make_five_conv(filters_list, in_filters):
m = nn.Sequential(
conv2d(in_filters, filters_list[0], 1),
conv2d(filters_list[0], filters_list[1], 3),
conv2d(filters_list[1], filters_list[0], 1),
conv2d(filters_list[0], filters_list[1], 3),
conv2d(filters_list[1], filters_list[0], 1),
)
return m
# Final head producing the YOLOv4 outputs
def yolo_head(filters_list, in_filters):
m = nn.Sequential(
conv2d(in_filters, filters_list[0], 3),
nn.Conv2d(filters_list[0], filters_list[1], 1),
)
return m
class YoloBody(nn.Module):
def __init__(self, anchors_mask=[[6, 7, 8], [3, 4, 5], [0, 1, 2]], num_classes=20, pretrained = False):
super(YoloBody, self).__init__()
        # Build the CSPDarknet53 backbone
self.backbone = darknet53(pretrained)
self.conv1 = make_three_conv([512,1024],1024)
self.SPP = SpatialPyramidPooling()
self.conv2 = make_three_conv([512,1024],2048)
self.upsample1 = Upsample(512,256)
self.conv_for_P4 = conv2d(512,256,1)
self.make_five_conv1 = make_five_conv([256, 512],512)
self.upsample2 = Upsample(256,128)
self.conv_for_P3 = conv2d(256,128,1)
self.make_five_conv2 = make_five_conv([128, 256],256)
# 3*(5+num_classes) = 3*(5+20) = 3*(4+1+20)=75
self.yolo_head3 = yolo_head([256, len(anchors_mask[0]) * (5 + num_classes)],128)
self.down_sample1 = conv2d(128,256,3,stride=2)
self.make_five_conv3 = make_five_conv([256, 512],512)
# 3*(5+num_classes) = 3*(5+20) = 3*(4+1+20)=75
self.yolo_head2 = yolo_head([512, len(anchors_mask[1]) * (5 + num_classes)],256)
self.down_sample2 = conv2d(256,512,3,stride=2)
self.make_five_conv4 = make_five_conv([512, 1024],1024)
# 3*(5+num_classes)=3*(5+20)=3*(4+1+20)=75
self.yolo_head1 = yolo_head([1024, len(anchors_mask[2]) * (5 + num_classes)],512)
def forward(self, x):
# backbone
x2, x1, x0 = self.backbone(x)
# 13,13,1024 -> 13,13,512 -> 13,13,1024 -> 13,13,512 -> 13,13,2048
P5 = self.conv1(x0)
P5 = self.SPP(P5)
# 13,13,2048 -> 13,13,512 -> 13,13,1024 -> 13,13,512
P5 = self.conv2(P5)
# 13,13,512 -> 13,13,256 -> 26,26,256
P5_upsample = self.upsample1(P5)
# 26,26,512 -> 26,26,256
P4 = self.conv_for_P4(x1)
# 26,26,256 + 26,26,256 -> 26,26,512
        P4 = torch.cat([P4, P5_upsample], dim=1)
# 26,26,512 -> 26,26,256 -> 26,26,512 -> 26,26,256 -> 26,26,512 -> 26,26,256
P4 = self.make_five_conv1(P4)
# 26,26,256 -> 26,26,128 -> 52,52,128
P4_upsample = self.upsample2(P4)
# 52,52,256 -> 52,52,128
P3 = self.conv_for_P3(x2)
# 52,52,128 + 52,52,128 -> 52,52,256
        P3 = torch.cat([P3, P4_upsample], dim=1)
# 52,52,256 -> 52,52,128 -> 52,52,256 -> 52,52,128 -> 52,52,256 -> 52,52,128
P3 = self.make_five_conv2(P3)
# 52,52,128 -> 26,26,256
P3_downsample = self.down_sample1(P3)
# 26,26,256 + 26,26,256 -> 26,26,512
        P4 = torch.cat([P3_downsample, P4], dim=1)
# 26,26,512 -> 26,26,256 -> 26,26,512 -> 26,26,256 -> 26,26,512 -> 26,26,256
P4 = self.make_five_conv3(P4)
# 26,26,256 -> 13,13,512
P4_downsample = self.down_sample2(P4)
# 13,13,512 + 13,13,512 -> 13,13,1024
        P5 = torch.cat([P4_downsample, P5], dim=1)
# 13,13,1024 -> 13,13,512 -> 13,13,1024 -> 13,13,512 -> 13,13,1024 -> 13,13,512
P5 = self.make_five_conv4(P5)
        # Third feature layer: y3 = (batch_size, 75, 52, 52)
out2 = self.yolo_head3(P3)
        # Second feature layer: y2 = (batch_size, 75, 26, 26)
out1 = self.yolo_head2(P4)
        # First feature layer: y1 = (batch_size, 75, 13, 13)
out0 = self.yolo_head1(P5)
return out0, out1, out2
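A quick smoke test of the assembled network (batch size $1$, shapes as annotated in the forward pass above):
model = YoloBody(num_classes=20)
imgs = torch.randn(1, 3, 416, 416)
out0, out1, out2 = model(imgs)
print(out0.shape, out1.shape, out2.shape)
# torch.Size([1, 75, 13, 13]) torch.Size([1, 75, 26, 26]) torch.Size([1, 75, 52, 52])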
2. The Loss Function of YOLOv4
The three feature maps output by the YOLOv4 network have shapes (in PyTorch layout) $n×75×13×13$, $n×75×26×26$ and $n×75×52×52$.
Each is reshaped to $(n×3×h×w×25)$: the image is divided into $h×w$ cells, each cell is assigned $3$ anchors, and each anchor carries $25$ parameters, $y_{pred}=(x_{pred},y_{pred},h_{pred},w_{pred},p_{pred},c_{pred})$, where $c_{pred}$ holds the $20$ class probabilities.
Computing the loss boils down to comparing y_pred with y_true:
- y_pred is the network's output for an image, spread over the three feature layers; it must be decoded before bounding boxes can be drawn on the image;
- y_true is the ground-truth label of the image, containing for each real box its offset within a grid cell, its width/height, and its class; it must likewise be encoded to match the structure of y_pred.
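Decoding follows the YOLOv3 convention (and matches get_ignore further below): with the grid-cell offset $(c_x, c_y)$ and the anchor size $(p_w, p_h)$, the raw outputs $(t_x, t_y, t_w, t_h)$ are decoded as
\[\begin{aligned} b_x &= \sigma(t_x) + c_x \\ b_y &= \sigma(t_y) + c_y \\ b_w &= p_w e^{t_w} \\ b_h &= p_h e^{t_h} \end{aligned}\]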
For each output feature layer, the loss is computed as follows:
- Use y_true to extract the positions on this layer that actually contain an object $(n×3×h×w×1)$ and their classes $(n×3×h×w×20)$;
- Process the raw output into a prediction tensor of shape $(n×3×h×w×25)$, along with the decoded $xywh$;
- For each image, compute the IoU between all ground-truth boxes and all predicted boxes; predicted boxes whose best IoU exceeds the ignore threshold ($0.5$) are excluded from the no-object confidence loss, since they match a ground truth too well to serve as negatives;
- Compute the CIoU as the regression loss; only positive samples contribute to it;
- Compute the confidence loss, which has two parts: where an object actually exists, the predicted confidence is compared against $1$; where none exists, it is compared against $0$, with the predictions ignored in the previous step masked out;
- Compute the classification loss, i.e. the gap between the predicted and true classes at positions that actually contain an object.
The loss function of YOLOv4 is:
\[\begin{aligned} \mathcal{L}_{\mathrm{loc}} & =\lambda_{\text {coord }} \sum_{i=0}^{S^2} \sum_{j=0}^B \mathbb{1}_{i j}^{\mathrm{obj}}\left[\text{CIoU loss}\left((x_i,y_i,w_i,h_i),(\hat{x}_i,\hat{y}_i,\hat{w}_i,\hat{h}_i)\right)\right] \\ \mathcal{L}_{\mathrm{cls}} & =\sum_{i=0}^{S^2} \sum_{j=0}^B\left(\mathbb{1}_{i j}^{\mathrm{obj}}+\lambda_{\text {noobj }}\left(1-\mathbb{1}_{i j}^{\mathrm{obj}}\right)\right)\left(-C_{i j}\log \hat{C}_{i j}-(1-C_{i j})\log (1-\hat{C}_{i j})\right) \\ &+\sum_{i=0}^{S^2} \sum_{c \in \mathcal{C}} \mathbb{1}_i^{\mathrm{obj}}\left(-p_i(c)\log\hat{p}_i(c)-(1-p_i(c))\log(1-\hat{p}_i(c))\right) \\ \mathcal{L} & =\mathcal{L}_{\mathrm{loc}}+\mathcal{L}_{\mathrm{cls}} \end{aligned}\]where the bounding-box regression loss is the CIoU loss, which extends IoU with a normalized penalty on the center distance and an aspect-ratio consistency term $\alpha v$:
\[\text{CIoU} = \text{IoU} - \frac{\rho^2(b_{pred},b_{gt})}{c^2} - \alpha v\]Here $\rho$ is the distance between the two box centers and $c$ is the diagonal length of the smallest box enclosing both; $v$ measures the consistency of the two aspect ratios, and $\alpha$ balances the weight of $v$:
\[\begin{aligned} v &= \frac{4}{\pi^2}\left(\arctan\frac{w^{gt}}{h^{gt}}-\arctan\frac{w}{h}\right)^2 \\ \alpha &= \frac{v}{(1-\text{IoU})+v} \end{aligned}\]The box_iou helper below (like the other helpers that follow, a method of the YOLOLoss class defined later) implements this CIoU computation:
    def box_iou(self, b1, b2):
"""
输入为:
b1: tensor, shape=(batch, anchor_num, feat_w, feat_h, 4), xywh
b2: tensor, shape=(batch, anchor_num, feat_w, feat_h, 4), xywh
返回为:
out: tensor, shape=(batch, anchor_num, feat_w, feat_h)
"""
        # Top-left and bottom-right corners of the predicted boxes
b1_xy = b1[..., :2]
b1_wh = b1[..., 2:4]
b1_wh_half = b1_wh / 2.
b1_mins = b1_xy - b1_wh_half
b1_maxes = b1_xy + b1_wh_half
        # Top-left and bottom-right corners of the ground-truth boxes
b2_xy = b2[..., :2]
b2_wh = b2[..., 2:4]
b2_wh_half = b2_wh / 2.
b2_mins = b2_xy - b2_wh_half
b2_maxes = b2_xy + b2_wh_half
        # IoU between the ground-truth and predicted boxes
intersect_mins = torch.max(b1_mins, b2_mins)
intersect_maxes = torch.min(b1_maxes, b2_maxes)
intersect_wh = torch.max(intersect_maxes - intersect_mins, torch.zeros_like(intersect_maxes))
intersect_area = intersect_wh[..., 0] * intersect_wh[..., 1]
b1_area = b1_wh[..., 0] * b1_wh[..., 1]
b2_area = b2_wh[..., 0] * b2_wh[..., 1]
union_area = b1_area + b2_area - intersect_area
iou = intersect_area / torch.clamp(union_area,min = 1e-6)
        # Offset between the two box centers
center_wh = b1_xy - b2_xy
        # Corners of the smallest box enclosing both boxes
enclose_mins = torch.min(b1_mins, b2_mins)
enclose_maxes = torch.max(b1_maxes, b2_maxes)
enclose_wh = torch.max(enclose_maxes - enclose_mins, torch.zeros_like(intersect_maxes))
        # Squared distance between the centers
        center_distance = torch.sum(torch.pow(center_wh, 2), dim=-1)
        # Squared diagonal length of the enclosing box
        enclose_diagonal = torch.sum(torch.pow(enclose_wh, 2), dim=-1)
ciou = iou - 1.0 * (center_distance) / torch.clamp(enclose_diagonal, min = 1e-6)
v = (4 / (math.pi ** 2)) * torch.pow((torch.atan(b1_wh[..., 0] / torch.clamp(b1_wh[..., 1],min = 1e-6)) - torch.atan(b2_wh[..., 0] / torch.clamp(b2_wh[..., 1], min = 1e-6))), 2)
alpha = v / torch.clamp((1.0 - iou + v), min = 1e-6)
out = ciou - alpha * v
return out
Both the objectness (confidence) loss and the classification loss use binary cross-entropy.
def clip_by_tensor(self, t, t_min, t_max):
t = t.float()
result = (t >= t_min).float() * t + (t < t_min).float() * t_min
result = (result <= t_max).float() * result + (result > t_max).float() * t_max
return result
def BCELoss(self, pred, target):
epsilon = 1e-7
pred = self.clip_by_tensor(pred, epsilon, 1.0 - epsilon)
output = - target * torch.log(pred) - (1.0 - target) * torch.log(1.0 - pred)
return output
YOLOv4 adopts label smoothing. For a binary task, original labels of $0$ and $1$ become, say, $0.005$ and $0.995$ after smoothing (a smoothing factor of $0.01$ spread over two classes): the model is mildly penalized for classifying too confidently, since overconfidence encourages overfitting.
def smooth_labels(self, y_true, label_smoothing, num_classes):
return y_true * (1.0 - label_smoothing) + label_smoothing / num_classes
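For instance, with the $20$ VOC classes and label_smoothing = 0.01, a one-hot class target is softened as follows:
# What smooth_labels computes for a one-hot target over 20 classes
y_true = torch.zeros(20)
y_true[3] = 1.0
y_smooth = y_true * (1.0 - 0.01) + 0.01 / 20
# y_smooth[3] == 0.9905, every other entry == 0.0005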
The total loss of YOLOv4 is implemented as:
class YOLOLoss(nn.Module):
def __init__(self, anchors, num_classes, input_shape, anchors_mask = [[6,7,8], [3,4,5], [0,1,2]], label_smoothing = 0, alpha = 0.25, gamma = 2):
super(YOLOLoss, self).__init__()
        # The 13x13 feature layer uses anchors [142, 110], [192, 243], [459, 401]
        # The 26x26 feature layer uses anchors [36, 75], [76, 55], [72, 146]
        # The 52x52 feature layer uses anchors [12, 16], [19, 36], [40, 28]
self.anchors = anchors
self.num_classes = num_classes
self.bbox_attrs = 5 + num_classes
self.input_shape = input_shape
self.anchors_mask = anchors_mask
self.label_smoothing = label_smoothing
self.balance = [0.4, 1.0, 4]
self.box_ratio = 0.05
self.obj_ratio = 5 * (input_shape[0] * input_shape[1]) / (416 ** 2)
self.cls_ratio = 1 * (num_classes / 80)
self.alpha = alpha
self.gamma = gamma
self.ignore_threshold = 0.5
def forward(self, l, input, targets=None):
        # l: index of the effective feature layer being processed
        # input shape: bs, 3*(5+num_classes), 13, 13
        #              bs, 3*(5+num_classes), 26, 26
        #              bs, 3*(5+num_classes), 52, 52
        # targets: ground-truth boxes, [batch_size, num_gt, 5]
        # Number of images and the height/width of the feature layer
bs = input.size(0)
in_h = input.size(2)
in_w = input.size(3)
        # How many pixels of the original image each feature point covers (32, 16 or 8)
stride_h = self.input_shape[0] / in_h
stride_w = self.input_shape[1] / in_w
        # scaled_anchors are now expressed in feature-layer units
scaled_anchors = [(a_w / stride_w, a_h / stride_h) for a_w, a_h in self.anchors]
        # Reshape the prediction:
        # bs, 3*(5+num_classes), h, w -> bs, 3, 5+num_classes, h, w -> bs, 3, h, w, 5+num_classes
prediction = input.view(bs, len(self.anchors_mask[l]), self.bbox_attrs, in_h, in_w).permute(0, 1, 3, 4, 2).contiguous()
        # Adjustment parameters for the anchor centers
x = torch.sigmoid(prediction[..., 0])
y = torch.sigmoid(prediction[..., 1])
        # Adjustment parameters for the anchor widths and heights
w = prediction[..., 2]
h = prediction[..., 3]
        # Objectness confidence: is there an object here
conf = torch.sigmoid(prediction[..., 4])
        # Class probabilities
pred_cls = torch.sigmoid(prediction[..., 5:])
        # Build the targets the network should predict
y_true, noobj_mask = self.get_target(l, targets, scaled_anchors, in_h, in_w)
        # Decode the predictions and measure their overlap with the ground truth
noobj_mask, pred_boxes = self.get_ignore(l, x, y, h, w, targets, scaled_anchors, in_h, in_w, noobj_mask)
loss = 0
obj_mask = y_true[..., 4] == 1
n = torch.sum(obj_mask)
if n != 0:
            # Gap between the predictions and the ground truth (CIoU)
iou = self.box_iou(pred_boxes, y_true[..., :4]).type_as(x)
obj_mask = obj_mask & torch.logical_not(torch.isnan(iou))
loss_loc = torch.mean((1 - iou)[obj_mask])
loss_cls = torch.mean(self.BCELoss(pred_cls[obj_mask], y_true[..., 5:][obj_mask]))
loss += loss_loc * self.box_ratio + loss_cls * self.cls_ratio
        # Confidence loss for object presence
loss_conf = torch.mean(self.BCELoss(conf, obj_mask.type_as(conf))[noobj_mask.bool() | obj_mask])
loss += loss_conf * self.balance[l] * self.obj_ratio
return loss
def calculate_iou(self, _box_a, _box_b):
        # Top-left and bottom-right corners of the ground-truth boxes
b1_x1, b1_x2 = _box_a[:, 0] - _box_a[:, 2] / 2, _box_a[:, 0] + _box_a[:, 2] / 2
b1_y1, b1_y2 = _box_a[:, 1] - _box_a[:, 3] / 2, _box_a[:, 1] + _box_a[:, 3] / 2
        # Top-left and bottom-right corners of the predicted boxes obtained from the anchors
b2_x1, b2_x2 = _box_b[:, 0] - _box_b[:, 2] / 2, _box_b[:, 0] + _box_b[:, 2] / 2
b2_y1, b2_y2 = _box_b[:, 1] - _box_b[:, 3] / 2, _box_b[:, 1] + _box_b[:, 3] / 2
        # Convert both sets of boxes to (x1, y1, x2, y2) corner form
box_a = torch.zeros_like(_box_a)
box_b = torch.zeros_like(_box_b)
box_a[:, 0], box_a[:, 1], box_a[:, 2], box_a[:, 3] = b1_x1, b1_y1, b1_x2, b1_y2
box_b[:, 0], box_b[:, 1], box_b[:, 2], box_b[:, 3] = b2_x1, b2_y1, b2_x2, b2_y2
        # A is the number of ground-truth boxes, B the number of predicted boxes
A = box_a.size(0)
B = box_b.size(0)
        # Intersection area
max_xy = torch.min(box_a[:, 2:].unsqueeze(1).expand(A, B, 2), box_b[:, 2:].unsqueeze(0).expand(A, B, 2))
min_xy = torch.max(box_a[:, :2].unsqueeze(1).expand(A, B, 2), box_b[:, :2].unsqueeze(0).expand(A, B, 2))
inter = torch.clamp((max_xy - min_xy), min=0)
inter = inter[:, :, 0] * inter[:, :, 1]
        # Areas of the predicted and ground-truth boxes
area_a = ((box_a[:, 2]-box_a[:, 0]) * (box_a[:, 3]-box_a[:, 1])).unsqueeze(1).expand_as(inter) # [A,B]
area_b = ((box_b[:, 2]-box_b[:, 0]) * (box_b[:, 3]-box_b[:, 1])).unsqueeze(0).expand_as(inter) # [A,B]
        # IoU
union = area_a + area_b - inter
return inter / union # [A,B]
def get_target(self, l, targets, anchors, in_h, in_w):
        # Number of images in the batch
bs = len(targets)
        # Marks which anchors contain no object
noobj_mask = torch.ones(bs, len(self.anchors_mask[l]), in_h, in_w, requires_grad = False)
        # Training targets: batch_size, 3, in_h, in_w, 5 + num_classes
y_true = torch.zeros(bs, len(self.anchors_mask[l]), in_h, in_w, self.bbox_attrs, requires_grad = False)
for b in range(bs):
if len(targets[b])==0:
continue
batch_target = torch.zeros_like(targets[b])
            # Ground-truth centers and sizes in feature-layer units
batch_target[:, [0,2]] = targets[b][:, [0,2]] * in_w
batch_target[:, [1,3]] = targets[b][:, [1,3]] * in_h
batch_target[:, 4] = targets[b][:, 4]
batch_target = batch_target.cpu()
            # Ground-truth boxes as (0, 0, w, h): num_true_box, 4
gt_box = torch.FloatTensor(torch.cat((torch.zeros((batch_target.size(0), 2)), batch_target[:, 2:4]), 1))
            # Anchors as (0, 0, w, h): 9, 4
anchor_shapes = torch.FloatTensor(torch.cat((torch.zeros((len(anchors), 2)), torch.FloatTensor(anchors)), 1))
            # best_ns: for each ground-truth box, the index of the anchor with the highest IoU
best_ns = torch.argmax(self.calculate_iou(gt_box, anchor_shapes), dim=-1)
for t, best_n in enumerate(best_ns):
if best_n not in self.anchors_mask[l]:
continue
                # Which of this layer's anchors the best match corresponds to
k = self.anchors_mask[l].index(best_n)
                # Grid cell the ground-truth center falls into
i = torch.floor(batch_target[t, 0]).long()
j = torch.floor(batch_target[t, 1]).long()
                # Class of the ground-truth box
c = batch_target[t, 4].long()
                # noobj_mask marks feature points without objects
noobj_mask[b, k, j, i] = 0
                # y_true stores the ground-truth center and size (in feature-layer units), objectness and class
y_true[b, k, j, i, 0] = batch_target[t, 0]
y_true[b, k, j, i, 1] = batch_target[t, 1]
y_true[b, k, j, i, 2] = batch_target[t, 2]
y_true[b, k, j, i, 3] = batch_target[t, 3]
y_true[b, k, j, i, 4] = 1
y_true[b, k, j, i, c + 5] = 1
return y_true, noobj_mask
def get_ignore(self, l, x, y, h, w, targets, scaled_anchors, in_h, in_w, noobj_mask):
        # Number of images in the batch
bs = len(targets)
        # Build the grid: the anchor centers sit at the top-left corner of each cell
grid_x = torch.linspace(0, in_w - 1, in_w).repeat(in_h, 1).repeat(
int(bs * len(self.anchors_mask[l])), 1, 1).view(x.shape).type_as(x)
grid_y = torch.linspace(0, in_h - 1, in_h).repeat(in_w, 1).t().repeat(
int(bs * len(self.anchors_mask[l])), 1, 1).view(y.shape).type_as(x)
        # Widths and heights of this layer's anchors
scaled_anchors_l = np.array(scaled_anchors)[self.anchors_mask[l]]
anchor_w = torch.Tensor(scaled_anchors_l).index_select(1, torch.LongTensor([0])).type_as(x)
anchor_h = torch.Tensor(scaled_anchors_l).index_select(1, torch.LongTensor([1])).type_as(x)
anchor_w = anchor_w.repeat(bs, 1).repeat(1, 1, in_h * in_w).view(w.shape)
anchor_h = anchor_h.repeat(bs, 1).repeat(1, 1, in_h * in_w).view(h.shape)
        # Decode: adjusted box centers and sizes
pred_boxes_x = torch.unsqueeze(x + grid_x, -1)
pred_boxes_y = torch.unsqueeze(y + grid_y, -1)
pred_boxes_w = torch.unsqueeze(torch.exp(w) * anchor_w, -1)
pred_boxes_h = torch.unsqueeze(torch.exp(h) * anchor_h, -1)
pred_boxes = torch.cat([pred_boxes_x, pred_boxes_y, pred_boxes_w, pred_boxes_h], dim = -1)
for b in range(bs):
            # Flatten the predicted boxes: (3 * in_h * in_w, 4)
pred_boxes_for_ignore = pred_boxes[b].view(-1, 4)
            # Ground-truth boxes (num_true_box, 4), converted to feature-layer units
if len(targets[b]) > 0:
batch_target = torch.zeros_like(targets[b])
                # Ground-truth centers and sizes in feature-layer units
batch_target[:, [0,2]] = targets[b][:, [0,2]] * in_w
batch_target[:, [1,3]] = targets[b][:, [1,3]] * in_h
batch_target = batch_target[:, :4].type_as(x)
                # IoU between the ground-truth boxes and all predicted boxes
anch_ious = self.calculate_iou(batch_target, pred_boxes_for_ignore)
                # For each predicted box, the highest IoU with any ground-truth box
anch_ious_max, _ = torch.max(anch_ious, dim = 0)
anch_ious_max = anch_ious_max.view(pred_boxes[b].size()[:3])
noobj_mask[b][anch_ious_max > self.ignore_threshold] = 0
return noobj_mask, pred_boxes
During training, the losses over the three feature maps are accumulated:
yolo_loss = YOLOLoss(anchors, num_classes, input_shape, anchors_mask, label_smoothing)
optimizer.zero_grad()
outputs = model_train(images)
loss_value_all = 0
for l in range(len(outputs)):
loss_item = yolo_loss(l, outputs[l], targets)
loss_value_all += loss_item
loss_value = loss_value_all
loss_value.backward()
optimizer.step()
YOLOv4 trains with a cosine annealing schedule for the learning rate: the learning rate first rises linearly (warm-up) and then decays along a cosine curve, and the cycle can be repeated several times. PyTorch provides a ready-made scheduler for the cosine-decay part (the warm-up is handled separately):
lr_scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=5, eta_min=1e-5)
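A sketch of stepping the scheduler once per epoch (the model and the epoch count are placeholders); for the repeated cycles described above, optim.lr_scheduler.CosineAnnealingWarmRestarts is the ready-made alternative.
import torch.optim as optim

optimizer = optim.SGD(model.parameters(), lr=1e-2, momentum=0.9)
lr_scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=5, eta_min=1e-5)
for epoch in range(30):
    # ... train for one epoch ...
    lr_scheduler.step()  # cosine decay from 1e-2 towards eta_min over T_max epochs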
3. Other Improvements in YOLOv4
(1) Data augmentation: Mosaic
Mosaic fuses four training images into one. It enriches the backgrounds against which objects are detected and effectively raises the batch size (each sample now carries four images), so the BatchNorm statistics are computed across four images at once, making BN more stable:
- read four images at a time;
- flip, scale and color-jitter each of the four images, then place them at the top-left, bottom-left, bottom-right and top-right positions respectively;
- combine the images and merge their boxes.
def merge_bboxes(self, bboxes, cutx, cuty):
merge_bbox = []
for i in range(len(bboxes)):
for box in bboxes[i]:
tmp_box = []
x1, y1, x2, y2 = box[0], box[1], box[2], box[3]
if i == 0:
if y1 > cuty or x1 > cutx:
continue
if y2 >= cuty and y1 <= cuty:
y2 = cuty
if x2 >= cutx and x1 <= cutx:
x2 = cutx
if i == 1:
if y2 < cuty or x1 > cutx:
continue
if y2 >= cuty and y1 <= cuty:
y1 = cuty
if x2 >= cutx and x1 <= cutx:
x2 = cutx
if i == 2:
if y2 < cuty or x2 < cutx:
continue
if y2 >= cuty and y1 <= cuty:
y1 = cuty
if x2 >= cutx and x1 <= cutx:
x1 = cutx
if i == 3:
if y1 > cuty or x2 < cutx:
continue
if y2 >= cuty and y1 <= cuty:
y2 = cuty
if x2 >= cutx and x1 <= cutx:
x1 = cutx
tmp_box.append(x1)
tmp_box.append(y1)
tmp_box.append(x2)
tmp_box.append(y2)
tmp_box.append(box[-1])
merge_bbox.append(tmp_box)
return merge_bbox
def get_random_data_with_Mosaic(self, annotation_line, input_shape, jitter=0.3, hue=.1, sat=0.7, val=0.4):
h, w = input_shape
min_offset_x = self.rand(0.3, 0.7)
min_offset_y = self.rand(0.3, 0.7)
image_datas = []
box_datas = []
index = 0
for line in annotation_line:
            # Read the image
line_content = line.split()
image = Image.open(line_content[0])
image = cvtColor(image)
            # Image size
iw, ih = image.size
            # Parse the box coordinates
box = np.array([np.array(list(map(int,box.split(',')))) for box in line_content[1:]])
            # Randomly flip the image
flip = self.rand()<.5
if flip and len(box)>0:
image = image.transpose(Image.FLIP_LEFT_RIGHT)
box[:, [0,2]] = iw - box[:, [2,0]]
            # Rescale the image and jitter its aspect ratio
new_ar = iw/ih * self.rand(1-jitter,1+jitter) / self.rand(1-jitter,1+jitter)
scale = self.rand(.4, 1)
if new_ar < 1:
nh = int(scale*h)
nw = int(nh*new_ar)
else:
nw = int(scale*w)
nh = int(nw/new_ar)
image = image.resize((nw, nh), Image.BICUBIC)
            # Place the image in its quadrant (one of the four split positions)
if index == 0:
dx = int(w*min_offset_x) - nw
dy = int(h*min_offset_y) - nh
elif index == 1:
dx = int(w*min_offset_x) - nw
dy = int(h*min_offset_y)
elif index == 2:
dx = int(w*min_offset_x)
dy = int(h*min_offset_y)
elif index == 3:
dx = int(w*min_offset_x)
dy = int(h*min_offset_y) - nh
new_image = Image.new('RGB', (w,h), (128,128,128))
new_image.paste(image, (dx, dy))
image_data = np.array(new_image)
index = index + 1
box_data = []
            # Remap the boxes onto the new canvas
if len(box)>0:
np.random.shuffle(box)
box[:, [0,2]] = box[:, [0,2]]*nw/iw + dx
box[:, [1,3]] = box[:, [1,3]]*nh/ih + dy
box[:, 0:2][box[:, 0:2]<0] = 0
box[:, 2][box[:, 2]>w] = w
box[:, 3][box[:, 3]>h] = h
box_w = box[:, 2] - box[:, 0]
box_h = box[:, 3] - box[:, 1]
box = box[np.logical_and(box_w>1, box_h>1)]
box_data = np.zeros((len(box),5))
box_data[:len(box)] = box
image_datas.append(image_data)
box_datas.append(box_data)
        # Stitch the four quadrants together
cutx = int(w * min_offset_x)
cuty = int(h * min_offset_y)
new_image = np.zeros([h, w, 3])
new_image[:cuty, :cutx, :] = image_datas[0][:cuty, :cutx, :]
new_image[cuty:, :cutx, :] = image_datas[1][cuty:, :cutx, :]
new_image[cuty:, cutx:, :] = image_datas[2][cuty:, cutx:, :]
new_image[:cuty, cutx:, :] = image_datas[3][:cuty, cutx:, :]
new_image = np.array(new_image, np.uint8)
        # Merge the boxes across the cut lines
new_boxes = self.merge_bboxes(box_datas, cutx, cuty)
return new_image, new_boxes
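The self.rand helper used above is not shown in the excerpt; in the reference implementation it is a thin uniform sampler along these lines:
def rand(self, a=0, b=1):
    return np.random.rand() * (b - a) + a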
(2) Self-Adversarial Training (SAT)
- In the first stage, the network runs an adversarial attack on the original image, generating adversarial noise so that it can no longer detect the objects in the image;
- In the second stage, the network is trained in the normal way to detect the objects on this modified image.
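The paper provides no reference code for SAT; below is a minimal sketch of the two stages under an FGSM-style attack (the sat_step name, the epsilon value and the reuse of the detection loss for the attack are assumptions of this sketch):
# Hypothetical SAT training step: attack the image first, then train on it
def sat_step(model, yolo_loss, images, targets, optimizer, epsilon=0.01):
    # Stage 1: perturb the input so the network "loses" the objects
    images = images.clone().detach().requires_grad_(True)
    outputs = model(images)
    loss = sum(yolo_loss(l, outputs[l], targets) for l in range(len(outputs)))
    loss.backward()
    adv_images = (images + epsilon * images.grad.sign()).detach()  # FGSM-style noise
    # Stage 2: an ordinary training step on the modified image
    optimizer.zero_grad()
    outputs = model(adv_images)
    loss = sum(yolo_loss(l, outputs[l], targets) for l in range(len(outputs)))
    loss.backward()
    optimizer.step()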