YOLOv3: An Incremental Improvement over YOLOv2.
Building on YOLOv2, YOLOv3 incorporates a large number of model-training tricks from the same period, using controlled comparisons and ablation experiments to select the methods that improve detection performance the most.
- Logistic regression for confidence scores: YOLO and YOLOv2 use a squared-error loss when constructing the loss for each predicted box's confidence score, whereas YOLOv3 uses logistic regression (a binary cross-entropy loss);
- No softmax for class prediction: when predicting classes for the anchors at each grid cell, YOLOv3 uses multiple independent logistic classifiers, one per class, instead of a softmax. This accounts for the fact that an image may carry multiple labels, and not all labels are guaranteed to be mutually exclusive (see the sketch after this list).
- An improved backbone, Darknet + ResNet: YOLOv3 uses the Darknet-53 network as its feature extractor, built from a series of $3\times 3$ and $1\times 1$ convolutions with residual blocks.
- Multi-scale prediction: inspired by multi-scale detection (as in SSD), YOLOv3 places anchors and runs detection on feature maps at several different scales of the feature extractor.
- Feature pyramid: YOLOv3 builds cross-layer connections between earlier, fine-grained feature maps and the output feature maps. The model upsamples the coarse-grained feature maps and concatenates them with the earlier features, passing fine-grained information along to help detect small objects.
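As a minimal sketch of the softmax-free class prediction (shapes and variable names here are illustrative, not taken from the reference implementation): each class gets its own logistic output trained with binary cross-entropy, so several labels can be active for the same box.
import torch
import torch.nn as nn
num_classes = 20
logits = torch.randn(4, num_classes) # raw class logits for 4 predicted boxes
targets = torch.zeros(4, num_classes)
targets[0, [3, 7]] = 1.0 # one box may carry several non-exclusive labels
# YOLOv3-style: independent logistic classifiers, one sigmoid + BCE per class
probs = torch.sigmoid(logits) # each class is scored independently in [0, 1]
loss = nn.BCEWithLogitsLoss()(logits, targets)
# a softmax (as in YOLO/YOLOv2) would instead force the classes to compete:
softmax_probs = torch.softmax(logits, dim=-1) # sums to 1, so labels become exclusive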
A complete PyTorch implementation of YOLOv3 can be found at yolov3-pytorch.
1. The YOLOv3 Network Architecture
The feature-extraction part of YOLOv3 uses the Darknet-53 network:
import math
from collections import OrderedDict
import torch
import torch.nn as nn
class ResidualBlock(nn.Module):
def __init__(self, inplanes, planes):
super(ResidualBlock, self).__init__()
self.conv1 = nn.Conv2d(inplanes, planes[0], kernel_size=1, stride=1, padding=0, bias=False)
self.bn1 = nn.BatchNorm2d(planes[0])
self.relu1 = nn.LeakyReLU(0.1)
self.conv2 = nn.Conv2d(planes[0], planes[1], kernel_size=3, stride=1, padding=1, bias=False)
self.bn2 = nn.BatchNorm2d(planes[1])
self.relu2 = nn.LeakyReLU(0.1)
def forward(self, x):
residual = x
out = self.conv1(x)
out = self.bn1(out)
out = self.relu1(out)
out = self.conv2(out)
out = self.bn2(out)
out = self.relu2(out)
out += residual
return out
class DarkNet(nn.Module):
def __init__(self, layers):
super(DarkNet, self).__init__()
self.inplanes = 32
# 416,416,3 -> 416,416,32
self.conv1 = nn.Conv2d(3, self.inplanes, kernel_size=3, stride=1, padding=1, bias=False)
self.bn1 = nn.BatchNorm2d(self.inplanes)
self.relu1 = nn.LeakyReLU(0.1)
# 416,416,32 -> 208,208,64
self.layer1 = self._make_layer([32, 64], layers[0])
# 208,208,64 -> 104,104,128
self.layer2 = self._make_layer([64, 128], layers[1])
# 104,104,128 -> 52,52,256
self.layer3 = self._make_layer([128, 256], layers[2])
# 52,52,256 -> 26,26,512
self.layer4 = self._make_layer([256, 512], layers[3])
# 26,26,512 -> 13,13,1024
self.layer5 = self._make_layer([512, 1024], layers[4])
self.layers_out_filters = [64, 128, 256, 512, 1024]
# initialize the weights
for m in self.modules():
if isinstance(m, nn.Conv2d):
n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
m.weight.data.normal_(0, math.sqrt(2. / n))
elif isinstance(m, nn.BatchNorm2d):
m.weight.data.fill_(1)
m.bias.data.zero_()
def _make_layer(self, planes, blocks):
layers = []
# downsampling: 3x3 convolution with stride 2
layers.append(("ds_conv", nn.Conv2d(self.inplanes, planes[1], kernel_size=3, stride=2, padding=1, bias=False)))
layers.append(("ds_bn", nn.BatchNorm2d(planes[1])))
layers.append(("ds_relu", nn.LeakyReLU(0.1)))
# append the residual blocks
self.inplanes = planes[1]
for i in range(0, blocks):
layers.append(("residual_{}".format(i), ResidualBlock(self.inplanes, planes)))
return nn.Sequential(OrderedDict(layers))
def forward(self, x):
x = self.conv1(x)
x = self.bn1(x)
x = self.relu1(x)
x = self.layer1(x)
x = self.layer2(x)
out3 = self.layer3(x)
out4 = self.layer4(out3)
out5 = self.layer5(out4)
return out3, out4, out5
def darknet53():
model = DarkNet([1, 2, 8, 8, 4])
return model
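A quick shape check (a usage sketch, not part of the reference code) confirms the three feature maps returned for a 416x416 input:
model = darknet53()
out3, out4, out5 = model(torch.randn(1, 3, 416, 416))
print(out3.shape, out4.shape, out5.shape)
# torch.Size([1, 256, 52, 52]) torch.Size([1, 512, 26, 26]) torch.Size([1, 1024, 13, 13])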
YOLOv3 builds a feature pyramid network on the three feature maps extracted by Darknet-53 (with sizes $52\times 52\times 256, 26\times 26\times 512, 13\times 13\times 1024$). For each scale, it places $3$ anchors at every feature location and predicts, for each anchor, $4$ bounding-box offsets, $1$ confidence score, and probabilities for $20$ classes (for the VOC dataset; $80$ for COCO). Each feature location therefore has an output dimension of $3\times(4+1+20)=75$.
def conv2d(filter_in, filter_out, kernel_size):
pad = (kernel_size - 1) // 2 if kernel_size else 0
return nn.Sequential(OrderedDict([
("conv", nn.Conv2d(filter_in, filter_out, kernel_size=kernel_size, stride=1, padding=pad, bias=False)),
("bn", nn.BatchNorm2d(filter_out)),
("relu", nn.LeakyReLU(0.1)),
]))
# make_last_layers contains seven convolutions in total: the first five extract features,
# and the last two produce the YOLO head's predictions
def make_last_layers(filters_list, in_filters, out_filter):
m = nn.Sequential(
conv2d(in_filters, filters_list[0], 1),
conv2d(filters_list[0], filters_list[1], 3),
conv2d(filters_list[1], filters_list[0], 1),
conv2d(filters_list[0], filters_list[1], 3),
conv2d(filters_list[1], filters_list[0], 1),
conv2d(filters_list[0], filters_list[1], 3),
nn.Conv2d(filters_list[1], out_filter, kernel_size=1, stride=1, padding=0, bias=True)
)
return m
class YoloBody(nn.Module):
def __init__(self, anchors_mask, num_classes, pretrained = False):
super(YoloBody, self).__init__()
# the three effective feature layers of the darknet53 backbone,
# with shapes 52,52,256; 26,26,512; 13,13,1024
self.backbone = darknet53()
if pretrained:
self.backbone.load_state_dict(torch.load("model_data/darknet53_backbone_weights.pth"))
# out_filters : [64, 128, 256, 512, 1024]
out_filters = self.backbone.layers_out_filters
# compute the output channel count of each yolo_head; for the VOC dataset,
# final_out_filter0 = final_out_filter1 = final_out_filter2 = 75
self.last_layer0 = make_last_layers([512, 1024], out_filters[-1], len(anchors_mask[0]) * (num_classes + 5))
self.last_layer1_conv = conv2d(512, 256, 1)
self.last_layer1_upsample = nn.Upsample(scale_factor=2, mode='nearest')
self.last_layer1 = make_last_layers([256, 512], out_filters[-2] + 256, len(anchors_mask[1]) * (num_classes + 5))
self.last_layer2_conv = conv2d(256, 128, 1)
self.last_layer2_upsample = nn.Upsample(scale_factor=2, mode='nearest')
self.last_layer2 = make_last_layers([128, 256], out_filters[-3] + 128, len(anchors_mask[2]) * (num_classes + 5))
def forward(self, x):
# obtain the three effective feature layers, with shapes 52,52,256; 26,26,512; 13,13,1024
x2, x1, x0 = self.backbone(x)
# first feature layer: out0 = (batch_size,75,13,13)
# 13,13,1024 -> 13,13,512 -> 13,13,1024 -> 13,13,512 -> 13,13,1024 -> 13,13,512
out0_branch = self.last_layer0[:5](x0)
# 13,13,512 -> 13,13,1024 -> 13,13,75
out0 = self.last_layer0[5:](out0_branch)
# 13,13,512 -> 13,13,256 -> 26,26,256
x1_in = self.last_layer1_conv(out0_branch)
x1_in = self.last_layer1_upsample(x1_in)
# 26,26,256 + 26,26,512 -> 26,26,768
x1_in = torch.cat([x1_in, x1], 1)
# second feature layer: out1 = (batch_size,75,26,26)
# 26,26,768 -> 26,26,256 -> 26,26,512 -> 26,26,256 -> 26,26,512 -> 26,26,256
out1_branch = self.last_layer1[:5](x1_in)
# 26,26,256 -> 26,26,512 -> 26,26,75
out1 = self.last_layer1[5:](out1_branch)
# 26,26,256 -> 26,26,128 -> 52,52,128
x2_in = self.last_layer2_conv(out1_branch)
x2_in = self.last_layer2_upsample(x2_in)
# 52,52,128 + 52,52,256 -> 52,52,384
x2_in = torch.cat([x2_in, x2], 1)
# third feature layer: out2 = (batch_size,75,52,52)
# 52,52,384 -> 52,52,128 -> 52,52,256 -> 52,52,128 -> 52,52,256 -> 52,52,128 -> 52,52,256 -> 52,52,75
out2 = self.last_layer2(x2_in)
return out0, out1, out2
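The complete model can be checked the same way (again a usage sketch; anchors_mask matches the default of the YOLOLoss class below, and num_classes=20 corresponds to VOC):
anchors_mask = [[6, 7, 8], [3, 4, 5], [0, 1, 2]]
model = YoloBody(anchors_mask, num_classes = 20)
out0, out1, out2 = model(torch.randn(1, 3, 416, 416))
print(out0.shape, out1.shape, out2.shape)
# torch.Size([1, 75, 13, 13]) torch.Size([1, 75, 26, 26]) torch.Size([1, 75, 52, 52])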
2. Anchor Settings in YOLOv3
YOLOv3 runs the k-means algorithm on the dataset to generate anchors of $k=9$ sizes and distributes them across the three feature-map scales: larger boxes are assigned to the small feature maps to detect large objects, and smaller boxes to the large feature maps to detect small objects. On the COCO dataset, the $9$ box sizes obtained by k-means are:
(10,13), (16,30), (33,23), (30,61), (62,45), (59,119), (116,90), (156,198), (373,326)
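The assignment of these 9 anchors to the three scales is expressed as an anchors_mask of indices into the sorted anchor list (this mirrors the default mask of the YOLOLoss class later in this post). The k-means procedure itself is implemented below:
anchors = [(10,13), (16,30), (33,23), (30,61), (62,45), (59,119), (116,90), (156,198), (373,326)]
anchors_mask = [[6, 7, 8], # 13x13 feature map: largest anchors, for large objects
                [3, 4, 5], # 26x26 feature map: medium anchors
                [0, 1, 2]] # 52x52 feature map: smallest anchors, for small objects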
import numpy as np
def cas_iou(box, cluster):
x = np.minimum(cluster[:, 0], box[0])
y = np.minimum(cluster[:, 1], box[1])
intersection = x * y
area1 = box[0] * box[1]
area2 = cluster[:,0] * cluster[:,1]
iou = intersection / (area1 + area2 - intersection)
return iou
def avg_iou(box, cluster):
return np.mean([np.max(cas_iou(box[i], cluster)) for i in range(box.shape[0])])
def kmeans(box, k):
# total number of boxes
row = box.shape[0]
# distance from every box to every cluster center
distance = np.empty((row, k))
# cluster assignment from the previous iteration
last_clu = np.zeros((row, ))
# randomly pick k boxes as the initial cluster centers
cluster = box[np.random.choice(row, k, replace = False)]
iter = 0
while True:
# distance of each box from every cluster center, measured as 1 - IoU
for i in range(row):
distance[i] = 1 - cas_iou(box[i], cluster)
# assign each box to its nearest cluster center
near = np.argmin(distance, axis=1)
if (last_clu == near).all():
break
# update each cluster center to the median of its boxes
for j in range(k):
cluster[j] = np.median(
box[near == j],axis=0)
last_clu = near
if iter % 5 == 0:
print('iter: {:d}. avg_iou:{:.2f}'.format(iter, avg_iou(box, cluster)))
iter += 1
return cluster, near
import glob
import xml.etree.ElementTree as ET
import matplotlib.pyplot as plt
from tqdm import tqdm
def load_data(path):
data = []
# scan every xml file for boxes
for xml_file in tqdm(glob.glob('{}/*xml'.format(path))):
tree = ET.parse(xml_file)
height = int(tree.findtext('./size/height'))
width = int(tree.findtext('./size/width'))
if height<=0 or width<=0:
continue
# record the width and height of every object, normalized by image size
for obj in tree.iter('object'):
xmin = int(float(obj.findtext('bndbox/xmin'))) / width
ymin = int(float(obj.findtext('bndbox/ymin'))) / height
xmax = int(float(obj.findtext('bndbox/xmax'))) / width
ymax = int(float(obj.findtext('bndbox/ymax'))) / height
xmin = np.float64(xmin)
ymin = np.float64(ymin)
xmax = np.float64(xmax)
ymax = np.float64(ymax)
# store the width and height
data.append([xmax - xmin, ymax - ymin])
return np.array(data)
if __name__ == '__main__':
np.random.seed(0)
# running this script parses the xml files under './VOCdevkit/VOC2007/Annotations'
# and generates yolo_anchors.txt
input_shape = [416, 416]
anchors_num = 9
# load the dataset; VOC-style xml annotations can be used
path = 'VOCdevkit/VOC2007/Annotations'
# load all xml files; boxes are stored as width,height ratios
print('Load xmls.')
data = load_data(path)
print('Load xmls done.')
# run k-means clustering
print('K-means boxes.')
cluster, near = kmeans(data, anchors_num)
print('K-means boxes done.')
data = data * np.array([input_shape[1], input_shape[0]])
cluster = cluster * np.array([input_shape[1], input_shape[0]])
# plot the clusters
for j in range(anchors_num):
plt.scatter(data[near == j][:,0], data[near == j][:,1])
plt.scatter(cluster[j][0], cluster[j][1], marker='x', c='black')
plt.savefig("kmeans_for_anchors.jpg")
plt.show()
print('Save kmeans_for_anchors.jpg in root dir.')
cluster = cluster[np.argsort(cluster[:, 0] * cluster[:, 1])]
print('avg_iou:{:.2f}'.format(avg_iou(data, cluster)))
print(cluster)
f = open("yolo_anchors.txt", 'w')
row = np.shape(cluster)[0]
for i in range(row):
if i == 0:
x_y = "%d,%d" % (cluster[i][0], cluster[i][1])
else:
x_y = ", %d,%d" % (cluster[i][0], cluster[i][1])
f.write(x_y)
f.close()
3. The YOLOv3 Loss Function
The YOLOv3 loss function is:
\[\begin{aligned} \mathcal{L}_{\mathrm{loc}} & =\lambda_{\text {coord }} \sum_{i=0}^{S^2} \sum_{j=0}^B \mathbb{1}_{i j}^{\mathrm{obj}}\left[\text{GIoU loss}\left((x_i,y_i,w_i,h_i),(\hat{x}_i,\hat{y}_i,\hat{w}_i,\hat{h}_i)\right)\right] \\ \mathcal{L}_{\mathrm{cls}} & =\sum_{i=0}^{S^2} \sum_{j=0}^B\left(\mathbb{1}_{i j}^{\mathrm{obj}}+\lambda_{\text {noobj }}\left(1-\mathbb{1}_{i j}^{\mathrm{obj}}\right)\right)\left(-C_{i j}\log \hat{C}_{i j}-(1-C_{i j})\log (1-\hat{C}_{i j})\right) \\ &+\sum_{i=0}^{S^2} \sum_{c \in \mathcal{C}} \mathbb{1}_i^{\mathrm{obj}}\left(-p_i(c)\log\hat{p}_i(c)-(1-p_i(c))\log(1-\hat{p}_i(c))\right) \\ \mathcal{L} & =\mathcal{L}_{\mathrm{loc}}+\mathcal{L}_{\mathrm{cls}} \end{aligned}\]where the bounding-box regression loss is the GIoU loss. Let \(A=(x_i,y_i,w_i,h_i)\) and \(B=(\hat{x}_i,\hat{y}_i,\hat{w}_i,\hat{h}_i)\) denote the predicted box and the ground-truth box, and let $C$ be their smallest enclosing rectangle; the GIoU loss is then computed as:
\[\begin{aligned} \text{GIoU loss} = 1- \text{GIoU} = 1-\frac{|A \cap B|}{|A \cup B|} + \frac{|C \setminus (A \cup B)|}{|C|} \end{aligned}\]
In code (implemented as a method of the YOLOLoss class defined below):
def box_giou(self, b1, b2):
"""
输入:
b1: tensor, shape=(batch, feat_w, feat_h, anchor_num, 4), xywh
b2: tensor, shape=(batch, feat_w, feat_h, anchor_num, 4), xywh
返回:
giou: tensor, shape=(batch, feat_w, feat_h, anchor_num, 1)
"""
# top-left and bottom-right corners of the predicted boxes
b1_xy = b1[..., :2]
b1_wh = b1[..., 2:4]
b1_wh_half = b1_wh/2.
b1_mins = b1_xy - b1_wh_half
b1_maxes = b1_xy + b1_wh_half
# top-left and bottom-right corners of the ground-truth boxes
b2_xy = b2[..., :2]
b2_wh = b2[..., 2:4]
b2_wh_half = b2_wh/2.
b2_mins = b2_xy - b2_wh_half
b2_maxes = b2_xy + b2_wh_half
# IoU between all ground-truth and predicted boxes
intersect_mins = torch.max(b1_mins, b2_mins)
intersect_maxes = torch.min(b1_maxes, b2_maxes)
intersect_wh = torch.max(intersect_maxes - intersect_mins, torch.zeros_like(intersect_maxes))
intersect_area = intersect_wh[..., 0] * intersect_wh[..., 1]
b1_area = b1_wh[..., 0] * b1_wh[..., 1]
b2_area = b2_wh[..., 0] * b2_wh[..., 1]
union_area = b1_area + b2_area - intersect_area
iou = intersect_area / union_area
# smallest enclosing box of the ground-truth and predicted boxes
enclose_mins = torch.min(b1_mins, b2_mins)
enclose_maxes = torch.max(b1_maxes, b2_maxes)
enclose_wh = torch.max(enclose_maxes - enclose_mins, torch.zeros_like(intersect_maxes))
# compute GIoU
enclose_area = enclose_wh[..., 0] * enclose_wh[..., 1]
giou = iou - (enclose_area - union_area) / enclose_area
return giou
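A quick numeric check of the GIoU formula (a standalone sketch; box_giou expects the same xywh layout):
import torch
# two boxes in xywh form: centers (5,5) and (7,5), both of size 4x4
b1 = torch.tensor([5.0, 5.0, 4.0, 4.0]).view(1, 1, 1, 1, 4)
b2 = torch.tensor([7.0, 5.0, 4.0, 4.0]).view(1, 1, 1, 1, 4)
# intersection = 2*4 = 8, union = 16 + 16 - 8 = 24 -> IoU = 8/24 = 1/3
# enclosing box spans x in [3,9], y in [3,7], area 24 -> GIoU = 1/3 - (24-24)/24 = 1/3
# box_giou returns the same value for these inputs, so the GIoU loss is 1 - 1/3 = 2/3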
Both the confidence loss and the classification loss use binary cross-entropy (the helpers below are also methods of the YOLOLoss class):
def clip_by_tensor(self, t, t_min, t_max):
t = t.float()
result = (t >= t_min).float() * t + (t < t_min).float() * t_min
result = (result <= t_max).float() * result + (result > t_max).float() * t_max
return result
def BCELoss(self, pred, target):
epsilon = 1e-7
pred = self.clip_by_tensor(pred, epsilon, 1.0 - epsilon)
output = - target * torch.log(pred) - (1.0 - target) * torch.log(1.0 - pred)
return output
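For reference, these helpers behave like PyTorch built-ins (an equivalence sketch; clip_by_tensor matches torch.clamp, and reduction='none' keeps the per-element losses that the total loss masks later):
import torch
import torch.nn.functional as F
pred = torch.rand(2, 3)
target = torch.randint(0, 2, (2, 3)).float()
eps = 1e-7
p = pred.clamp(eps, 1.0 - eps) # equivalent to clip_by_tensor(pred, eps, 1 - eps)
manual = - target * torch.log(p) - (1.0 - target) * torch.log(1.0 - p)
builtin = F.binary_cross_entropy(pred, target, reduction='none')
print(torch.allclose(manual, builtin, atol=1e-5)) # True, up to the epsilon clipping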
The total YOLOv3 loss is then defined as:
class YOLOLoss(nn.Module):
def __init__(self, num_classes, input_shape, anchors_mask = [[6,7,8], [3,4,5], [0,1,2]]):
super(YOLOLoss, self).__init__()
# the 13x13 feature map uses the anchors [116,90],[156,198],[373,326]
# the 26x26 feature map uses the anchors [30,61],[62,45],[59,119]
# the 52x52 feature map uses the anchors [10,13],[16,30],[33,23]
self.anchors = np.array([[10,13],[16,30],[33,23],[30,61],[62,45],[59,119],[116,90],[156,198],[373,326]])
self.num_classes = num_classes
self.bbox_attrs = 5 + num_classes
self.input_shape = input_shape
self.anchors_mask = anchors_mask
self.balance = [0.4, 1.0, 4]
self.box_ratio = 0.05
self.obj_ratio = 5 * (input_shape[0] * input_shape[1]) / (416 ** 2)
self.cls_ratio = 1 * (num_classes / 80)
self.ignore_threshold = 0.5
def forward(self, l, input, targets=None):
# l is the index of the effective feature layer currently being processed
# input has shape bs, 3*(5+num_classes), 13, 13 (l=0)
# bs, 3*(5+num_classes), 26, 26 (l=1)
# bs, 3*(5+num_classes), 52, 52 (l=2)
# targets are the normalized ground-truth boxes (center position + width + height).
# number of images, and height and width of the feature layer
bs = input.size(0)
in_h = input.size(2)
in_w = input.size(3)
# compute the stride:
# how many pixels of the original image each feature point corresponds to
# for a 13x13 feature layer, one feature point corresponds to 32 pixels of the original image
# for a 26x26 feature layer, one feature point corresponds to 16 pixels
# for a 52x52 feature layer, one feature point corresponds to 8 pixels
stride_h = self.input_shape[0] / in_h
stride_w = self.input_shape[1] / in_w
# scaled_anchors are now expressed relative to the feature layer
scaled_anchors = [(a_w / stride_w, a_h / stride_h) for a_w, a_h in self.anchors]
# reshape the input:
# bs, 3*(5+num_classes), in_h, in_w => bs, 3, in_h, in_w, 5 + num_classes
prediction = input.view(bs, len(self.anchors_mask[l]), self.bbox_attrs, in_h, in_w).permute(0, 1, 3, 4, 2).contiguous()
x = torch.sigmoid(prediction[..., 0]) # adjustments to the anchor's center position
y = torch.sigmoid(prediction[..., 1])
w = prediction[..., 2] # adjustments to the anchor's width and height
h = prediction[..., 3]
conf = torch.sigmoid(prediction[..., 4]) # confidence: is an object present
pred_cls = torch.sigmoid(prediction[..., 5:]) # class probabilities
# build the training targets from the ground truth
y_true, noobj_mask, box_loss_scale = self.get_target(l, targets, scaled_anchors, in_h, in_w)
# decode the predictions into boxes and ignore those that overlap a ground-truth box too much
noobj_mask, pred_boxes = self.get_ignore(l, x, y, h, w, targets, scaled_anchors, in_h, in_w, noobj_mask)
# box_loss_scale is the product of the ground-truth box's width and height, both in 0-1, so the product is too.
# 2 - product: the larger the ground-truth box, the smaller its weight; small boxes get a larger weight.
box_loss_scale = 2 - box_loss_scale
loss = 0
obj_mask = y_true[..., 4] == 1
n = torch.sum(obj_mask)
if n != 0:
# giou between the predictions and the ground truth
giou = self.box_giou(pred_boxes, y_true[..., :4]).type_as(x)
loss_loc = torch.mean((1 - giou)[obj_mask])
loss_cls = torch.mean(self.BCELoss(pred_cls[obj_mask], y_true[..., 5:][obj_mask]))
loss += loss_loc * self.box_ratio + loss_cls * self.cls_ratio
loss_conf = torch.mean(self.BCELoss(conf, obj_mask.type_as(conf))[noobj_mask.bool() | obj_mask])
loss += loss_conf * self.balance[l] * self.obj_ratio
return loss
def get_target(self, l, targets, anchors, in_h, in_w):
# number of images in the batch
bs = len(targets)
# marks which anchors contain no object
noobj_mask = torch.ones(bs, len(self.anchors_mask[l]), in_h, in_w, requires_grad = False)
# used to make the network pay more attention to small objects
box_loss_scale = torch.zeros(bs, len(self.anchors_mask[l]), in_h, in_w, requires_grad = False)
# label shape: bs, 3, in_h, in_w, 5 + num_classes
y_true = torch.zeros(bs, len(self.anchors_mask[l]), in_h, in_w, self.bbox_attrs, requires_grad = False)
for b in range(bs):
if len(targets[b])==0:
continue
batch_target = torch.zeros_like(targets[b])
# map the ground-truth centers, sizes and classes onto the feature layer
batch_target[:, [0,2]] = targets[b][:, [0,2]] * in_w
batch_target[:, [1,3]] = targets[b][:, [1,3]] * in_h
batch_target[:, 4] = targets[b][:, 4]
batch_target = batch_target.cpu()
# compare ground-truth boxes and anchors by shape only: center both at the origin
gt_box = torch.FloatTensor(torch.cat((torch.zeros((batch_target.size(0), 2)), batch_target[:, 2:4]), 1))
anchor_shapes = torch.FloatTensor(torch.cat((torch.zeros((len(anchors), 2)), torch.FloatTensor(anchors)), 1))
# IoU [num_true_box, 9]: overlap of every ground-truth box with the 9 anchors
# best_ns: for each ground-truth box, the index of the best-matching anchor
best_ns = torch.argmax(self.calculate_iou(gt_box, anchor_shapes), dim=-1)
for t, best_n in enumerate(best_ns):
if best_n not in self.anchors_mask[l]:
continue
# which of this feature layer's anchors the best match corresponds to
k = self.anchors_mask[l].index(best_n)
# which grid cell the ground-truth box falls into
i = torch.floor(batch_target[t, 0]).long()
j = torch.floor(batch_target[t, 1]).long()
# class of the ground-truth box
c = batch_target[t, 4].long()
# noobj_mask = 1 marks feature points without objects
noobj_mask[b, k, j, i] = 0
y_true[b, k, j, i, 0] = batch_target[t, 0]
y_true[b, k, j, i, 1] = batch_target[t, 1]
y_true[b, k, j, i, 2] = batch_target[t, 2]
y_true[b, k, j, i, 3] = batch_target[t, 3]
y_true[b, k, j, i, 4] = 1
y_true[b, k, j, i, c + 5] = 1
# large objects get a small loss weight, small objects a large one
box_loss_scale[b, k, j, i] = batch_target[t, 2] * batch_target[t, 3] / in_w / in_h
return y_true, noobj_mask, box_loss_scale
def calculate_iou(self, _box_a, _box_b):
# top-left and bottom-right corners of the ground-truth boxes
b1_x1, b1_x2 = _box_a[:, 0] - _box_a[:, 2] / 2, _box_a[:, 0] + _box_a[:, 2] / 2
b1_y1, b1_y2 = _box_a[:, 1] - _box_a[:, 3] / 2, _box_a[:, 1] + _box_a[:, 3] / 2
# top-left and bottom-right corners of the predicted boxes derived from the anchors
b2_x1, b2_x2 = _box_b[:, 0] - _box_b[:, 2] / 2, _box_b[:, 0] + _box_b[:, 2] / 2
b2_y1, b2_y2 = _box_b[:, 1] - _box_b[:, 3] / 2, _box_b[:, 1] + _box_b[:, 3] / 2
# convert both sets of boxes to top-left/bottom-right form
box_a = torch.zeros_like(_box_a)
box_b = torch.zeros_like(_box_b)
box_a[:, 0], box_a[:, 1], box_a[:, 2], box_a[:, 3] = b1_x1, b1_y1, b1_x2, b1_y2
box_b[:, 0], box_b[:, 1], box_b[:, 2], box_b[:, 3] = b2_x1, b2_y1, b2_x2, b2_y2
# A is the number of ground-truth boxes, B the number of predicted boxes
A = box_a.size(0)
B = box_b.size(0)
# intersection area
max_xy = torch.min(box_a[:, 2:].unsqueeze(1).expand(A, B, 2), box_b[:, 2:].unsqueeze(0).expand(A, B, 2))
min_xy = torch.max(box_a[:, :2].unsqueeze(1).expand(A, B, 2), box_b[:, :2].unsqueeze(0).expand(A, B, 2))
inter = torch.clamp((max_xy - min_xy), min=0)
inter = inter[:, :, 0] * inter[:, :, 1]
# areas of the ground-truth and predicted boxes
area_a = ((box_a[:, 2]-box_a[:, 0]) * (box_a[:, 3]-box_a[:, 1])).unsqueeze(1).expand_as(inter)
area_b = ((box_b[:, 2]-box_b[:, 0]) * (box_b[:, 3]-box_b[:, 1])).unsqueeze(0).expand_as(inter)
# IoU
union = area_a + area_b - inter
return inter / union
def get_ignore(self, l, x, y, h, w, targets, scaled_anchors, in_h, in_w, noobj_mask):
# number of images in the batch
bs = len(targets)
# generate the grid; the anchor centers sit at the top-left corners of the grid cells
grid_x = torch.linspace(0, in_w - 1, in_w).repeat(in_h, 1).repeat(
int(bs * len(self.anchors_mask[l])), 1, 1).view(x.shape).type_as(x)
grid_y = torch.linspace(0, in_h - 1, in_h).repeat(in_w, 1).t().repeat(
int(bs * len(self.anchors_mask[l])), 1, 1).view(y.shape).type_as(x)
# widths and heights of the anchors
scaled_anchors_l = np.array(scaled_anchors)[self.anchors_mask[l]]
anchor_w = torch.Tensor(scaled_anchors_l).index_select(1, torch.LongTensor([0])).type_as(x)
anchor_h = torch.Tensor(scaled_anchors_l).index_select(1, torch.LongTensor([1])).type_as(x)
anchor_w = anchor_w.repeat(bs, 1).repeat(1, 1, in_h * in_w).view(w.shape)
anchor_h = anchor_h.repeat(bs, 1).repeat(1, 1, in_h * in_w).view(h.shape)
# centers, widths and heights of the adjusted anchors (the predicted boxes)
pred_boxes_x = torch.unsqueeze(x + grid_x, -1)
pred_boxes_y = torch.unsqueeze(y + grid_y, -1)
pred_boxes_w = torch.unsqueeze(torch.exp(w) * anchor_w, -1)
pred_boxes_h = torch.unsqueeze(torch.exp(h) * anchor_h, -1)
pred_boxes = torch.cat([pred_boxes_x, pred_boxes_y, pred_boxes_w, pred_boxes_h], dim = -1)
for b in range(bs):
# flatten the predictions: pred_boxes_for_ignore [num_anchors, 4]
pred_boxes_for_ignore = pred_boxes[b].view(-1, 4)
# build gt_box: convert the ground-truth boxes to the scale of the feature layer [num_true_box, 4]
if len(targets[b]) > 0:
batch_target = torch.zeros_like(targets[b])
# ground-truth centers on the feature layer
batch_target[:, [0,2]] = targets[b][:, [0,2]] * in_w
batch_target[:, [1,3]] = targets[b][:, [1,3]] * in_h
batch_target = batch_target[:, :4].type_as(x)
# IoU: anch_ious [num_true_box, num_anchors]
anch_ious = self.calculate_iou(batch_target, pred_boxes_for_ignore)
# max overlap of each predicted box with any ground-truth box: anch_ious_max [num_anchors]
anch_ious_max, _ = torch.max(anch_ious, dim = 0)
anch_ious_max = anch_ious_max.view(pred_boxes[b].size()[:3])
noobj_mask[b][anch_ious_max > self.ignore_threshold] = 0
return noobj_mask, pred_boxes
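A minimal end-to-end check of a single scale (an illustrative sketch, assuming box_giou, clip_by_tensor, BCELoss and the methods above are collected into the YOLOLoss class; targets is a list of per-image tensors holding normalized x, y, w, h and the class index):
criterion = YOLOLoss(num_classes = 20, input_shape = [416, 416])
out0 = torch.randn(2, 75, 13, 13) # head output for the 13x13 scale (l = 0)
targets = [torch.tensor([[0.5, 0.5, 0.2, 0.3, 4.0]]), torch.zeros((0, 5))]
loss0 = criterion(0, out0, targets)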
During training, the losses from the three feature maps are accumulated:
yolo_loss = YOLOLoss(num_classes, input_shape, anchors_mask)
optimizer.zero_grad()
outputs = model_train(images)
loss_value_all = 0
for l in range(len(outputs)):
loss_item = yolo_loss(l, outputs[l], targets)
loss_value_all += loss_item
loss_value = loss_value_all
loss_value.backward()
optimizer.step()