YOLOv3: An Incremental Improvement over YOLOv2.
Building on YOLOv2, YOLOv3 incorporates a large number of model-training tricks from the same period, using controlled comparisons and ablation experiments to select the methods that improve detection performance the most.
- Logistic regression for confidence scores: YOLO and YOLOv2 use a squared-error loss when constructing the loss for each predicted box's confidence score, whereas YOLOv3 uses logistic regression (a binary cross-entropy loss);
- No softmax for class prediction: when predicting classes for the anchors at each grid cell, YOLOv3 uses multiple independent logistic classifiers, one per class, instead of a softmax. This accounts for the fact that an image may carry multiple labels, and not all labels are guaranteed to be mutually exclusive (see the sketch after this list).
- An improved backbone, Darknet + ResNet: YOLOv3 uses the Darknet-53 network as its feature extractor, built from a series of $3\times 3$ and $1\times 1$ convolutions with residual blocks.
- Multi-scale prediction: inspired by multi-scale detection (as in SSD), YOLOv3 places anchors and runs detection on feature maps at several different scales of the feature extractor.
- Feature pyramid: YOLOv3 builds cross-layer connections between earlier, fine-grained feature maps and the output feature maps. The model upsamples the coarse-grained feature maps and concatenates them with the earlier features, passing fine-grained information along to help detect small objects.
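As a minimal sketch of the softmax-free class prediction (shapes and variable names here are illustrative, not taken from the reference implementation): each class gets its own logistic output trained with binary cross-entropy, so several labels can be active for the same box.
import torch
import torch.nn as nn
num_classes = 20
logits = torch.randn(4, num_classes) # raw class logits for 4 predicted boxes
targets = torch.zeros(4, num_classes)
targets[0, [3, 7]] = 1.0 # one box may carry several non-exclusive labels
# YOLOv3-style: independent logistic classifiers, one sigmoid + BCE per class
probs = torch.sigmoid(logits) # each class is scored independently in [0, 1]
loss = nn.BCEWithLogitsLoss()(logits, targets)
# a softmax (as in YOLO/YOLOv2) would instead force the classes to compete:
softmax_probs = torch.softmax(logits, dim=-1) # sums to 1, so labels become exclusive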
A complete PyTorch implementation of YOLOv3 can be found at yolov3-pytorch.
1. The YOLOv3 Network Architecture
The feature-extraction part of YOLOv3 uses the Darknet-53 network:
import math
from collections import OrderedDict
import torch
import torch.nn as nn
class ResidualBlock(nn.Module):
def __init__(self, inplanes, planes):
super(ResidualBlock, self).__init__()
self.conv1 = nn.Conv2d(inplanes, planes[0], kernel_size=1, stride=1, padding=0, bias=False)
self.bn1 = nn.BatchNorm2d(planes[0])
self.relu1 = nn.LeakyReLU(0.1)
self.conv2 = nn.Conv2d(planes[0], planes[1], kernel_size=3, stride=1, padding=1, bias=False)
self.bn2 = nn.BatchNorm2d(planes[1])
self.relu2 = nn.LeakyReLU(0.1)
def forward(self, x):
residual = x
out = self.conv1(x)
out = self.bn1(out)
out = self.relu1(out)
out = self.conv2(out)
out = self.bn2(out)
out = self.relu2(out)
out += residual
return out
class DarkNet(nn.Module):
def __init__(self, layers):
super(DarkNet, self).__init__()
self.inplanes = 32
# 416,416,3 -> 416,416,32
self.conv1 = nn.Conv2d(3, self.inplanes, kernel_size=3, stride=1, padding=1, bias=False)
self.bn1 = nn.BatchNorm2d(self.inplanes)
self.relu1 = nn.LeakyReLU(0.1)
# 416,416,32 -> 208,208,64
self.layer1 = self._make_layer([32, 64], layers[0])
# 208,208,64 -> 104,104,128
self.layer2 = self._make_layer([64, 128], layers[1])
# 104,104,128 -> 52,52,256
self.layer3 = self._make_layer([128, 256], layers[2])
# 52,52,256 -> 26,26,512
self.layer4 = self._make_layer([256, 512], layers[3])
# 26,26,512 -> 13,13,1024
self.layer5 = self._make_layer([512, 1024], layers[4])
self.layers_out_filters = [64, 128, 256, 512, 1024]
# initialize the weights
for m in self.modules():
if isinstance(m, nn.Conv2d):
n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
m.weight.data.normal_(0, math.sqrt(2. / n))
elif isinstance(m, nn.BatchNorm2d):
m.weight.data.fill_(1)
m.bias.data.zero_()
def _make_layer(self, planes, blocks):
layers = []
# downsampling: 3x3 convolution with stride 2
layers.append(("ds_conv", nn.Conv2d(self.inplanes, planes[1], kernel_size=3, stride=2, padding=1, bias=False)))
layers.append(("ds_bn", nn.BatchNorm2d(planes[1])))
layers.append(("ds_relu", nn.LeakyReLU(0.1)))
# append the residual blocks
self.inplanes = planes[1]
for i in range(0, blocks):
layers.append(("residual_{}".format(i), ResidualBlock(self.inplanes, planes)))
return nn.Sequential(OrderedDict(layers))
def forward(self, x):
x = self.conv1(x)
x = self.bn1(x)
x = self.relu1(x)
x = self.layer1(x)
x = self.layer2(x)
out3 = self.layer3(x)
out4 = self.layer4(out3)
out5 = self.layer5(out4)
return out3, out4, out5
def darknet53():
model = DarkNet([1, 2, 8, 8, 4])
return model
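A quick shape check (a usage sketch, not part of the reference code) confirms the three feature maps returned for a 416x416 input:
model = darknet53()
out3, out4, out5 = model(torch.randn(1, 3, 416, 416))
print(out3.shape, out4.shape, out5.shape)
# torch.Size([1, 256, 52, 52]) torch.Size([1, 512, 26, 26]) torch.Size([1, 1024, 13, 13])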
YOLOv3 builds a feature pyramid network on the three feature maps extracted by Darknet-53 (with sizes $52\times 52\times 256, 26\times 26\times 512, 13\times 13\times 1024$). For each scale, it places $3$ anchors at every feature location and predicts, for each anchor, $4$ bounding-box offsets, $1$ confidence score, and probabilities for $20$ classes (for the VOC dataset; $80$ for COCO). Each feature location therefore has an output dimension of $3\times(4+1+20)=75$.
def conv2d(filter_in, filter_out, kernel_size):
pad = (kernel_size - 1) // 2 if kernel_size else 0
return nn.Sequential(OrderedDict([
("conv", nn.Conv2d(filter_in, filter_out, kernel_size=kernel_size, stride=1, padding=pad, bias=False)),
("bn", nn.BatchNorm2d(filter_out)),
("relu", nn.LeakyReLU(0.1)),
]))
# make_last_layers contains seven convolutions in total: the first five extract features,
# and the last two produce the YOLO head's predictions
def make_last_layers(filters_list, in_filters, out_filter):
m = nn.Sequential(
conv2d(in_filters, filters_list[0], 1),
conv2d(filters_list[0], filters_list[1], 3),
conv2d(filters_list[1], filters_list[0], 1),
conv2d(filters_list[0], filters_list[1], 3),
conv2d(filters_list[1], filters_list[0], 1),
conv2d(filters_list[0], filters_list[1], 3),
nn.Conv2d(filters_list[1], out_filter, kernel_size=1, stride=1, padding=0, bias=True)
)
return m
class YoloBody(nn.Module):
def __init__(self, anchors_mask, num_classes, pretrained = False):
super(YoloBody, self).__init__()
# the three effective feature layers of the darknet53 backbone,
# with shapes 52,52,256; 26,26,512; 13,13,1024
self.backbone = darknet53()
if pretrained:
self.backbone.load_state_dict(torch.load("model_data/darknet53_backbone_weights.pth"))
# out_filters : [64, 128, 256, 512, 1024]
out_filters = self.backbone.layers_out_filters
# compute the output channel count of each yolo_head; for the VOC dataset,
# final_out_filter0 = final_out_filter1 = final_out_filter2 = 75
self.last_layer0 = make_last_layers([512, 1024], out_filters[-1], len(anchors_mask[0]) * (num_classes + 5))
self.last_layer1_conv = conv2d(512, 256, 1)
self.last_layer1_upsample = nn.Upsample(scale_factor=2, mode='nearest')
self.last_layer1 = make_last_layers([256, 512], out_filters[-2] + 256, len(anchors_mask[1]) * (num_classes + 5))
self.last_layer2_conv = conv2d(256, 128, 1)
self.last_layer2_upsample = nn.Upsample(scale_factor=2, mode='nearest')
self.last_layer2 = make_last_layers([128, 256], out_filters[-3] + 128, len(anchors_mask[2]) * (num_classes + 5))
def forward(self, x):
# obtain the three effective feature layers, with shapes 52,52,256; 26,26,512; 13,13,1024
x2, x1, x0 = self.backbone(x)
# first feature layer: out0 = (batch_size,75,13,13)
# 13,13,1024 -> 13,13,512 -> 13,13,1024 -> 13,13,512 -> 13,13,1024 -> 13,13,512
out0_branch = self.last_layer0[:5](x0)
# 13,13,512 -> 13,13,1024 -> 13,13,75
out0 = self.last_layer0[5:](out0_branch)
# 13,13,512 -> 13,13,256 -> 26,26,256
x1_in = self.last_layer1_conv(out0_branch)
x1_in = self.last_layer1_upsample(x1_in)
# 26,26,256 + 26,26,512 -> 26,26,768
x1_in = torch.cat([x1_in, x1], 1)
# second feature layer: out1 = (batch_size,75,26,26)
# 26,26,768 -> 26,26,256 -> 26,26,512 -> 26,26,256 -> 26,26,512 -> 26,26,256
out1_branch = self.last_layer1[:5](x1_in)
# 26,26,256 -> 26,26,512 -> 26,26,75
out1 = self.last_layer1[5:](out1_branch)
# 26,26,256 -> 26,26,128 -> 52,52,128
x2_in = self.last_layer2_conv(out1_branch)
x2_in = self.last_layer2_upsample(x2_in)
# 52,52,128 + 52,52,256 -> 52,52,384
x2_in = torch.cat([x2_in, x2], 1)
# third feature layer: out2 = (batch_size,75,52,52)
# 52,52,384 -> 52,52,128 -> 52,52,256 -> 52,52,128 -> 52,52,256 -> 52,52,128 -> 52,52,256 -> 52,52,75
out2 = self.last_layer2(x2_in)
return out0, out1, out2
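The complete model can be checked the same way (again a usage sketch; anchors_mask matches the default of the YOLOLoss class below, and num_classes=20 corresponds to VOC):
anchors_mask = [[6, 7, 8], [3, 4, 5], [0, 1, 2]]
model = YoloBody(anchors_mask, num_classes = 20)
out0, out1, out2 = model(torch.randn(1, 3, 416, 416))
print(out0.shape, out1.shape, out2.shape)
# torch.Size([1, 75, 13, 13]) torch.Size([1, 75, 26, 26]) torch.Size([1, 75, 52, 52])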
2. Anchor Settings in YOLOv3
YOLOv3 runs the k-means algorithm on the dataset to generate anchors of $k=9$ sizes and distributes them across the three feature-map scales: larger boxes are assigned to the small feature maps to detect large objects, and smaller boxes to the large feature maps to detect small objects. On the COCO dataset, the $9$ box sizes obtained by k-means are:
(10,13), (16,30), (33,23), (30,61), (62,45), (59,119), (116,90), (156,198), (373,326)
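The assignment of these 9 anchors to the three scales is expressed as an anchors_mask of indices into the sorted anchor list (this mirrors the default mask of the YOLOLoss class later in this post). The k-means procedure itself is implemented below:
anchors = [(10,13), (16,30), (33,23), (30,61), (62,45), (59,119), (116,90), (156,198), (373,326)]
anchors_mask = [[6, 7, 8], # 13x13 feature map: largest anchors, for large objects
                [3, 4, 5], # 26x26 feature map: medium anchors
                [0, 1, 2]] # 52x52 feature map: smallest anchors, for small objects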
import numpy as np
def cas_iou(box, cluster):
x = np.minimum(cluster[:, 0], box[0])
y = np.minimum(cluster[:, 1], box[1])
intersection = x * y
area1 = box[0] * box[1]
area2 = cluster[:,0] * cluster[:,1]
iou = intersection / (area1 + area2 - intersection)
return iou
def avg_iou(box, cluster):
return np.mean([np.max(cas_iou(box[i], cluster)) for i in range(box.shape[0])])
def kmeans(box, k):
# total number of boxes
row = box.shape[0]
# distance from every box to every cluster center
distance = np.empty((row, k))
# cluster assignment from the previous iteration
last_clu = np.zeros((row, ))
# randomly pick k boxes as the initial cluster centers
cluster = box[np.random.choice(row, k, replace = False)]
iter = 0
while True:
# distance of each box from every cluster center, measured as 1 - IoU
for i in range(row):
distance[i] = 1 - cas_iou(box[i], cluster)
# assign each box to its nearest cluster center
near = np.argmin(distance, axis=1)
if (last_clu == near).all():
break
# update each cluster center to the median of its boxes
for j in range(k):
cluster[j] = np.median(
box[near == j],axis=0)
last_clu = near
if iter % 5 == 0:
print('iter: {:d}. avg_iou:{:.2f}'.format(iter, avg_iou(box, cluster)))
iter += 1
return cluster, near
import glob
import xml.etree.ElementTree as ET
import matplotlib.pyplot as plt
from tqdm import tqdm
def load_data(path):
data = []
# scan every xml file for boxes
for xml_file in tqdm(glob.glob('{}/*xml'.format(path))):
tree = ET.parse(xml_file)
height = int(tree.findtext('./size/height'))
width = int(tree.findtext('./size/width'))
if height<=0 or width<=0:
continue
# record the width and height of every object, normalized by image size
for obj in tree.iter('object'):
xmin = int(float(obj.findtext('bndbox/xmin'))) / width
ymin = int(float(obj.findtext('bndbox/ymin'))) / height
xmax = int(float(obj.findtext('bndbox/xmax'))) / width
ymax = int(float(obj.findtext('bndbox/ymax'))) / height
xmin = np.float64(xmin)
ymin = np.float64(ymin)
xmax = np.float64(xmax)
ymax = np.float64(ymax)
# store the width and height
data.append([xmax - xmin, ymax - ymin])
return np.array(data)
if __name__ == '__main__':
np.random.seed(0)
# running this script parses the xml files under './VOCdevkit/VOC2007/Annotations'
# and generates yolo_anchors.txt
input_shape = [416, 416]
anchors_num = 9
# load the dataset; VOC-style xml annotations can be used
path = 'VOCdevkit/VOC2007/Annotations'
# load all xml files; boxes are stored as width,height ratios
print('Load xmls.')
data = load_data(path)
print('Load xmls done.')
# run k-means clustering
print('K-means boxes.')
cluster, near = kmeans(data, anchors_num)
print('K-means boxes done.')
data = data * np.array([input_shape[1], input_shape[0]])
cluster = cluster * np.array([input_shape[1], input_shape[0]])
# plot the clusters
for j in range(anchors_num):
plt.scatter(data[near == j][:,0], data[near == j][:,1])
plt.scatter(cluster[j][0], cluster[j][1], marker='x', c='black')
plt.savefig("kmeans_for_anchors.jpg")
plt.show()
print('Save kmeans_for_anchors.jpg in root dir.')
cluster = cluster[np.argsort(cluster[:, 0] * cluster[:, 1])]
print('avg_iou:{:.2f}'.format(avg_iou(data, cluster)))
print(cluster)
f = open("yolo_anchors.txt", 'w')
row = np.shape(cluster)[0]
for i in range(row):
if i == 0:
x_y = "%d,%d" % (cluster[i][0], cluster[i][1])
else:
x_y = ", %d,%d" % (cluster[i][0], cluster[i][1])
f.write(x_y)
f.close()
3. The YOLOv3 Loss Function
The YOLOv3 loss function is:
\[\begin{aligned} \mathcal{L}_{\mathrm{loc}} & =\lambda_{\text {coord }} \sum_{i=0}^{S^2} \sum_{j=0}^B \mathbb{1}_{i j}^{\mathrm{obj}}\left[\text{GIoU loss}\left((x_i,y_i,w_i,h_i),(\hat{x}_i,\hat{y}_i,\hat{w}_i,\hat{h}_i)\right)\right] \\ \mathcal{L}_{\mathrm{cls}} & =\sum_{i=0}^{S^2} \sum_{j=0}^B\left(\mathbb{1}_{i j}^{\mathrm{obj}}+\lambda_{\text {noobj }}\left(1-\mathbb{1}_{i j}^{\mathrm{obj}}\right)\right)\left(-C_{i j}\log \hat{C}_{i j}-(1-C_{i j})\log (1-\hat{C}_{i j})\right) \\ &+\sum_{i=0}^{S^2} \sum_{c \in \mathcal{C}} \mathbb{1}_i^{\mathrm{obj}}\left(-p_i(c)\log\hat{p}_i(c)-(1-p_i(c))\log(1-\hat{p}_i(c))\right) \\ \mathcal{L} & =\mathcal{L}_{\mathrm{loc}}+\mathcal{L}_{\mathrm{cls}} \end{aligned}\]where the bounding-box regression loss is the GIoU loss. Let \(A=(x_i,y_i,w_i,h_i)\) and \(B=(\hat{x}_i,\hat{y}_i,\hat{w}_i,\hat{h}_i)\) denote the predicted box and the ground-truth box, and let $C$ be their smallest enclosing rectangle; the GIoU loss is then computed as:
\[\begin{aligned} \text{GIoU loss} = 1- \text{GIoU} = 1-\frac{|A \cap B|}{|A \cup B|} + \frac{|C \setminus (A \cup B)|}{|C|} \end{aligned}\]
In code (implemented as a method of the YOLOLoss class defined below):
def box_giou(self, b1, b2):
"""
输入:
b1: tensor, shape=(batch, feat_w, feat_h, anchor_num, 4), xywh
b2: tensor, shape=(batch, feat_w, feat_h, anchor_num, 4), xywh
返回:
giou: tensor, shape=(batch, feat_w, feat_h, anchor_num, 1)
"""
# top-left and bottom-right corners of the predicted boxes
b1_xy = b1[..., :2]
b1_wh = b1[..., 2:4]
b1_wh_half = b1_wh/2.
b1_mins = b1_xy - b1_wh_half
b1_maxes = b1_xy + b1_wh_half
# top-left and bottom-right corners of the ground-truth boxes
b2_xy = b2[..., :2]
b2_wh = b2[..., 2:4]
b2_wh_half = b2_wh/2.
b2_mins = b2_xy - b2_wh_half
b2_maxes = b2_xy + b2_wh_half
# IoU between all ground-truth and predicted boxes
intersect_mins = torch.max(b1_mins, b2_mins)
intersect_maxes = torch.min(b1_maxes, b2_maxes)
intersect_wh = torch.max(intersect_maxes - intersect_mins, torch.zeros_like(intersect_maxes))
intersect_area = intersect_wh[..., 0] * intersect_wh[..., 1]
b1_area = b1_wh[..., 0] * b1_wh[..., 1]
b2_area = b2_wh[..., 0] * b2_wh[..., 1]
union_area = b1_area + b2_area - intersect_area
iou = intersect_area / union_area
# smallest enclosing box of the ground-truth and predicted boxes
enclose_mins = torch.min(b1_mins, b2_mins)
enclose_maxes = torch.max(b1_maxes, b2_maxes)
enclose_wh = torch.max(enclose_maxes - enclose_mins, torch.zeros_like(intersect_maxes))
# compute GIoU
enclose_area = enclose_wh[..., 0] * enclose_wh[..., 1]
giou = iou - (enclose_area - union_area) / enclose_area
return giou
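A quick numeric check of the GIoU formula (a standalone sketch; box_giou expects the same xywh layout):
import torch
# two boxes in xywh form: centers (5,5) and (7,5), both of size 4x4
b1 = torch.tensor([5.0, 5.0, 4.0, 4.0]).view(1, 1, 1, 1, 4)
b2 = torch.tensor([7.0, 5.0, 4.0, 4.0]).view(1, 1, 1, 1, 4)
# intersection = 2*4 = 8, union = 16 + 16 - 8 = 24 -> IoU = 8/24 = 1/3
# enclosing box spans x in [3,9], y in [3,7], area 24 -> GIoU = 1/3 - (24-24)/24 = 1/3
# box_giou returns the same value for these inputs, so the GIoU loss is 1 - 1/3 = 2/3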
Both the confidence loss and the classification loss use binary cross-entropy (the helpers below are also methods of the YOLOLoss class):
def clip_by_tensor(self, t, t_min, t_max):
t = t.float()
result = (t >= t_min).float() * t + (t < t_min).float() * t_min
result = (result <= t_max).float() * result + (result > t_max).float() * t_max
return result
def BCELoss(self, pred, target):
epsilon = 1e-7
pred = self.clip_by_tensor(pred, epsilon, 1.0 - epsilon)
output = - target * torch.log(pred) - (1.0 - target) * torch.log(1.0 - pred)
return output
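For reference, these helpers behave like PyTorch built-ins (an equivalence sketch; clip_by_tensor matches torch.clamp, and reduction='none' keeps the per-element losses that the total loss masks later):
import torch
import torch.nn.functional as F
pred = torch.rand(2, 3)
target = torch.randint(0, 2, (2, 3)).float()
eps = 1e-7
p = pred.clamp(eps, 1.0 - eps) # equivalent to clip_by_tensor(pred, eps, 1 - eps)
manual = - target * torch.log(p) - (1.0 - target) * torch.log(1.0 - p)
builtin = F.binary_cross_entropy(pred, target, reduction='none')
print(torch.allclose(manual, builtin, atol=1e-5)) # True, up to the epsilon clipping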
The total YOLOv3 loss is then defined as:
class YOLOLoss(nn.Module):
def __init__(self, num_classes, input_shape, anchors_mask = [[6,7,8], [3,4,5], [0,1,2]]):
super(YOLOLoss, self).__init__()
# the 13x13 feature map uses the anchors [116,90],[156,198],[373,326]
# the 26x26 feature map uses the anchors [30,61],[62,45],[59,119]
# the 52x52 feature map uses the anchors [10,13],[16,30],[33,23]
self.anchors = np.array([[10,13],[16,30],[33,23],[30,61],[62,45],[59,119],[116,90],[156,198],[373,326]])
self.num_classes = num_classes
self.bbox_attrs = 5 + num_classes
self.input_shape = input_shape
self.anchors_mask = anchors_mask
self.balance = [0.4, 1.0, 4]
self.box_ratio = 0.05
self.obj_ratio = 5 * (input_shape[0] * input_shape[1]) / (416 ** 2)
self.cls_ratio = 1 * (num_classes / 80)
self.ignore_threshold = 0.5
def forward(self, l, input, targets=None):
# l is the index of the effective feature layer currently being processed
# input has shape bs, 3*(5+num_classes), 13, 13 (l=0)
# bs, 3*(5+num_classes), 26, 26 (l=1)
# bs, 3*(5+num_classes), 52, 52 (l=2)
# targets are the normalized ground-truth boxes (center position + width + height).
# number of images, and height and width of the feature layer
bs = input.size(0)
in_h = input.size(2)
in_w = input.size(3)
# compute the stride:
# how many pixels of the original image each feature point corresponds to
# for a 13x13 feature layer, one feature point corresponds to 32 pixels of the original image
# for a 26x26 feature layer, one feature point corresponds to 16 pixels
# for a 52x52 feature layer, one feature point corresponds to 8 pixels
stride_h = self.input_shape[0] / in_h
stride_w = self.input_shape[1] / in_w
# scaled_anchors are now expressed relative to the feature layer
scaled_anchors = [(a_w / stride_w, a_h / stride_h) for a_w, a_h in self.anchors]
# reshape the input:
# bs, 3*(5+num_classes), in_h, in_w => bs, 3, in_h, in_w, 5 + num_classes
prediction = input.view(bs, len(self.anchors_mask[l]), self.bbox_attrs, in_h, in_w).permute(0, 1, 3, 4, 2).contiguous()
x = torch.sigmoid(prediction[..., 0]) # adjustments to the anchor's center position
y = torch.sigmoid(prediction[..., 1])
w = prediction[..., 2] # adjustments to the anchor's width and height
h = prediction[..., 3]
conf = torch.sigmoid(prediction[..., 4]) # confidence: is an object present
pred_cls = torch.sigmoid(prediction[..., 5:]) # class probabilities
# build the training targets from the ground truth
y_true, noobj_mask, box_loss_scale = self.get_target(l, targets, scaled_anchors, in_h, in_w)
# decode the predictions into boxes and ignore those that overlap a ground-truth box too much
noobj_mask, pred_boxes = self.get_ignore(l, x, y, h, w, targets, scaled_anchors, in_h, in_w, noobj_mask)
# box_loss_scale is the product of the ground-truth box's width and height, both in 0-1, so the product is too.
# 2 - product: the larger the ground-truth box, the smaller its weight; small boxes get a larger weight.
box_loss_scale = 2 - box_loss_scale
loss = 0
obj_mask = y_true[..., 4] == 1
n = torch.sum(obj_mask)
if n != 0:
# giou between the predictions and the ground truth
giou = self.box_giou(pred_boxes, y_true[..., :4]).type_as(x)
loss_loc = torch.mean((1 - giou)[obj_mask])
loss_cls = torch.mean(self.BCELoss(pred_cls[obj_mask], y_true[..., 5:][obj_mask]))
loss += loss_loc * self.box_ratio + loss_cls * self.cls_ratio
loss_conf = torch.mean(self.BCELoss(conf, obj_mask.type_as(conf))[noobj_mask.bool() | obj_mask])
loss += loss_conf * self.balance[l] * self.obj_ratio
return loss
def get_target(self, l, targets, anchors, in_h, in_w):
# number of images in the batch
bs = len(targets)
# marks which anchors contain no object
noobj_mask = torch.ones(bs, len(self.anchors_mask[l]), in_h, in_w, requires_grad = False)
# used to make the network pay more attention to small objects
box_loss_scale = torch.zeros(bs, len(self.anchors_mask[l]), in_h, in_w, requires_grad = False)
# label shape: bs, 3, in_h, in_w, 5 + num_classes
y_true = torch.zeros(bs, len(self.anchors_mask[l]), in_h, in_w, self.bbox_attrs, requires_grad = False)
for b in range(bs):
if len(targets[b])==0:
continue
batch_target = torch.zeros_like(targets[b])
# map the ground-truth centers, sizes and classes onto the feature layer
batch_target[:, [0,2]] = targets[b][:, [0,2]] * in_w
batch_target[:, [1,3]] = targets[b][:, [1,3]] * in_h
batch_target[:, 4] = targets[b][:, 4]
batch_target = batch_target.cpu()
# compare ground-truth boxes and anchors by shape only: center both at the origin
gt_box = torch.FloatTensor(torch.cat((torch.zeros((batch_target.size(0), 2)), batch_target[:, 2:4]), 1))
anchor_shapes = torch.FloatTensor(torch.cat((torch.zeros((len(anchors), 2)), torch.FloatTensor(anchors)), 1))
# IoU [num_true_box, 9]: overlap of every ground-truth box with the 9 anchors
# best_ns: for each ground-truth box, the index of the best-matching anchor
best_ns = torch.argmax(self.calculate_iou(gt_box, anchor_shapes), dim=-1)
for t, best_n in enumerate(best_ns):
if best_n not in self.anchors_mask[l]:
continue
# which of this feature layer's anchors the best match corresponds to
k = self.anchors_mask[l].index(best_n)
# which grid cell the ground-truth box falls into
i = torch.floor(batch_target[t, 0]).long()
j = torch.floor(batch_target[t, 1]).long()
# class of the ground-truth box
c = batch_target[t, 4].long()
# noobj_mask = 1 marks feature points without objects
noobj_mask[b, k, j, i] = 0
y_true[b, k, j, i, 0] = batch_target[t, 0]
y_true[b, k, j, i, 1] = batch_target[t, 1]
y_true[b, k, j, i, 2] = batch_target[t, 2]
y_true[b, k, j, i, 3] = batch_target[t, 3]
y_true[b, k, j, i, 4] = 1
y_true[b, k, j, i, c + 5] = 1
# large objects get a small loss weight, small objects a large one
box_loss_scale[b, k, j, i] = batch_target[t, 2] * batch_target[t, 3] / in_w / in_h
return y_true, noobj_mask, box_loss_scale
def calculate_iou(self, _box_a, _box_b):
# top-left and bottom-right corners of the ground-truth boxes
b1_x1, b1_x2 = _box_a[:, 0] - _box_a[:, 2] / 2, _box_a[:, 0] + _box_a[:, 2] / 2
b1_y1, b1_y2 = _box_a[:, 1] - _box_a[:, 3] / 2, _box_a[:, 1] + _box_a[:, 3] / 2
# top-left and bottom-right corners of the predicted boxes derived from the anchors
b2_x1, b2_x2 = _box_b[:, 0] - _box_b[:, 2] / 2, _box_b[:, 0] + _box_b[:, 2] / 2
b2_y1, b2_y2 = _box_b[:, 1] - _box_b[:, 3] / 2, _box_b[:, 1] + _box_b[:, 3] / 2
# convert both sets of boxes to top-left/bottom-right form
box_a = torch.zeros_like(_box_a)
box_b = torch.zeros_like(_box_b)
box_a[:, 0], box_a[:, 1], box_a[:, 2], box_a[:, 3] = b1_x1, b1_y1, b1_x2, b1_y2
box_b[:, 0], box_b[:, 1], box_b[:, 2], box_b[:, 3] = b2_x1, b2_y1, b2_x2, b2_y2
# A is the number of ground-truth boxes, B the number of predicted boxes
A = box_a.size(0)
B = box_b.size(0)
# intersection area
max_xy = torch.min(box_a[:, 2:].unsqueeze(1).expand(A, B, 2), box_b[:, 2:].unsqueeze(0).expand(A, B, 2))
min_xy = torch.max(box_a[:, :2].unsqueeze(1).expand(A, B, 2), box_b[:, :2].unsqueeze(0).expand(A, B, 2))
inter = torch.clamp((max_xy - min_xy), min=0)
inter = inter[:, :, 0] * inter[:, :, 1]
# areas of the ground-truth and predicted boxes
area_a = ((box_a[:, 2]-box_a[:, 0]) * (box_a[:, 3]-box_a[:, 1])).unsqueeze(1).expand_as(inter)
area_b = ((box_b[:, 2]-box_b[:, 0]) * (box_b[:, 3]-box_b[:, 1])).unsqueeze(0).expand_as(inter)
# IoU
union = area_a + area_b - inter
return inter / union
def get_ignore(self, l, x, y, h, w, targets, scaled_anchors, in_h, in_w, noobj_mask):
# number of images in the batch
bs = len(targets)
# generate the grid; the anchor centers sit at the top-left corners of the grid cells
grid_x = torch.linspace(0, in_w - 1, in_w).repeat(in_h, 1).repeat(
int(bs * len(self.anchors_mask[l])), 1, 1).view(x.shape).type_as(x)
grid_y = torch.linspace(0, in_h - 1, in_h).repeat(in_w, 1).t().repeat(
int(bs * len(self.anchors_mask[l])), 1, 1).view(y.shape).type_as(x)
# widths and heights of the anchors
scaled_anchors_l = np.array(scaled_anchors)[self.anchors_mask[l]]
anchor_w = torch.Tensor(scaled_anchors_l).index_select(1, torch.LongTensor([0])).type_as(x)
anchor_h = torch.Tensor(scaled_anchors_l).index_select(1, torch.LongTensor([1])).type_as(x)
anchor_w = anchor_w.repeat(bs, 1).repeat(1, 1, in_h * in_w).view(w.shape)
anchor_h = anchor_h.repeat(bs, 1).repeat(1, 1, in_h * in_w).view(h.shape)
# centers, widths and heights of the adjusted anchors (the predicted boxes)
pred_boxes_x = torch.unsqueeze(x + grid_x, -1)
pred_boxes_y = torch.unsqueeze(y + grid_y, -1)
pred_boxes_w = torch.unsqueeze(torch.exp(w) * anchor_w, -1)
pred_boxes_h = torch.unsqueeze(torch.exp(h) * anchor_h, -1)
pred_boxes = torch.cat([pred_boxes_x, pred_boxes_y, pred_boxes_w, pred_boxes_h], dim = -1)
for b in range(bs):
# flatten the predictions: pred_boxes_for_ignore [num_anchors, 4]
pred_boxes_for_ignore = pred_boxes[b].view(-1, 4)
# build gt_box: convert the ground-truth boxes to the scale of the feature layer [num_true_box, 4]
if len(targets[b]) > 0:
batch_target = torch.zeros_like(targets[b])
# ground-truth centers on the feature layer
batch_target[:, [0,2]] = targets[b][:, [0,2]] * in_w
batch_target[:, [1,3]] = targets[b][:, [1,3]] * in_h
batch_target = batch_target[:, :4].type_as(x)
# IoU: anch_ious [num_true_box, num_anchors]
anch_ious = self.calculate_iou(batch_target, pred_boxes_for_ignore)
# max overlap of each predicted box with any ground-truth box: anch_ious_max [num_anchors]
anch_ious_max, _ = torch.max(anch_ious, dim = 0)
anch_ious_max = anch_ious_max.view(pred_boxes[b].size()[:3])
noobj_mask[b][anch_ious_max > self.ignore_threshold] = 0
return noobj_mask, pred_boxes
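A minimal end-to-end check of a single scale (an illustrative sketch, assuming box_giou, clip_by_tensor, BCELoss and the methods above are collected into the YOLOLoss class; targets is a list of per-image tensors holding normalized x, y, w, h and the class index):
criterion = YOLOLoss(num_classes = 20, input_shape = [416, 416])
out0 = torch.randn(2, 75, 13, 13) # head output for the 13x13 scale (l = 0)
targets = [torch.tensor([[0.5, 0.5, 0.2, 0.3, 4.0]]), torch.zeros((0, 5))]
loss0 = criterion(0, out0, targets)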
During training, the losses from the three feature maps are accumulated:
yolo_loss = YOLOLoss(num_classes, input_shape, anchors_mask)
optimizer.zero_grad()
outputs = model_train(images)
loss_value_all = 0
for l in range(len(outputs)):
loss_item = yolo_loss(l, outputs[l], targets)
loss_value_all += loss_item
loss_value = loss_value_all
loss_value.backward()
optimizer.step()