SSD: Single Shot MultiBox Detector
1. The Structure of the SSD Model
SSD was among the first methods to exploit the pyramidal feature hierarchy of a convolutional network to detect objects at different scales efficiently. The model uses a VGG16 network pretrained on ImageNet as its backbone to extract image features. On top of the backbone, SSD appends several convolutional layers that spatially downsample the feature maps, yielding a feature-pyramid representation of the image at multiple scales, and performs object detection at every scale: the larger (higher-resolution) feature maps are used to detect small objects, while the smaller (lower-resolution) feature maps detect large objects.
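The code in this section relies on a VGG16 builder add_vgg that is not shown in the original text. The following is a minimal sketch of it, modeled on the standard SSD backbone: conv4_3 is the layer at index 21, and the fc6/fc7 layers of VGG16 are converted into a dilated conv6 and a 1×1 conv7 (index -2). The imports below are shared by all code in this article.

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F

# A minimal sketch of the VGG16 backbone builder (assumed helper, modeled on
# the standard SSD implementation): 'M' is a 2x2 max-pool, 'C' a ceil-mode pool.
base = [64, 64, 'M', 128, 128, 'M', 256, 256, 256, 'C',
        512, 512, 512, 'M', 512, 512, 512]

def add_vgg(pretrained=False):
    layers = []
    in_channels = 3
    for v in base:
        if v == 'M':
            layers += [nn.MaxPool2d(kernel_size=2, stride=2)]
        elif v == 'C':
            layers += [nn.MaxPool2d(kernel_size=2, stride=2, ceil_mode=True)]
        else:
            layers += [nn.Conv2d(in_channels, v, kernel_size=3, padding=1),
                       nn.ReLU(inplace=True)]
            in_channels = v
    # pool5 keeps the 19x19 resolution; fc6/fc7 become conv6 (dilated) and conv7
    pool5 = nn.MaxPool2d(kernel_size=3, stride=1, padding=1)
    conv6 = nn.Conv2d(512, 1024, kernel_size=3, padding=6, dilation=6)
    conv7 = nn.Conv2d(1024, 1024, kernel_size=1)
    layers += [pool5, conv6, nn.ReLU(inplace=True), conv7, nn.ReLU(inplace=True)]
    # (Loading pretrained ImageNet weights is omitted in this sketch.)
    return nn.ModuleList(layers)

The extra downsampling layers appended after conv7 are then defined as follows: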
def add_extras(in_channels, backbone_name):
    layers = []
    if backbone_name == 'vgg':
        # Block 6
        # 19,19,1024 -> 19,19,256 -> 10,10,512
        layers += [nn.Conv2d(in_channels, 256, kernel_size=1, stride=1)]
        layers += [nn.Conv2d(256, 512, kernel_size=3, stride=2, padding=1)]
        # Block 7
        # 10,10,512 -> 10,10,128 -> 5,5,256
        layers += [nn.Conv2d(512, 128, kernel_size=1, stride=1)]
        layers += [nn.Conv2d(128, 256, kernel_size=3, stride=2, padding=1)]
        # Block 8
        # 5,5,256 -> 5,5,128 -> 3,3,256
        layers += [nn.Conv2d(256, 128, kernel_size=1, stride=1)]
        layers += [nn.Conv2d(128, 256, kernel_size=3, stride=1)]
        # Block 9
        # 3,3,256 -> 3,3,128 -> 1,1,256
        layers += [nn.Conv2d(256, 128, kernel_size=1, stride=1)]
        layers += [nn.Conv2d(128, 256, kernel_size=3, stride=1)]
    return nn.ModuleList(layers)
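As a quick sanity check (a sketch, assuming the definitions above), feeding a conv7-like 19×19×1024 tensor through the extra layers reproduces the shapes annotated in the comments:

extras = add_extras(1024, 'vgg')
x = torch.randn(1, 1024, 19, 19)  # stands in for the conv7 output
for i, layer in enumerate(extras):
    x = layer(x)
    if i % 2 == 1:  # every second layer ends a block
        print(tuple(x.shape))
# (1, 512, 10, 10), (1, 256, 5, 5), (1, 256, 3, 3), (1, 256, 1, 1)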
class SSD300(nn.Module):
    def __init__(self, num_classes, backbone_name="vgg", pretrained=False):
        super(SSD300, self).__init__()
        self.num_classes = num_classes
        if backbone_name == "vgg":
            self.vgg = add_vgg(pretrained)
            self.extras = add_extras(1024, backbone_name)
            self.L2Norm = L2Norm(512, 20)
            mbox = [4, 6, 6, 6, 4, 4]

            loc_layers = []
            conf_layers = []
            backbone_source = [21, -2]
            #---------------------------------------------------#
            #   Among the feature layers returned by add_vgg,
            #   layers 21 and -2 are used for box regression and
            #   classification. They are the outputs of
            #   conv4_3 (38,38,512) and conv7 (19,19,1024).
            #---------------------------------------------------#
            for k, v in enumerate(backbone_source):
                loc_layers += [nn.Conv2d(self.vgg[v].out_channels, mbox[k] * 4, kernel_size=3, padding=1)]
                conf_layers += [nn.Conv2d(self.vgg[v].out_channels, mbox[k] * num_classes, kernel_size=3, padding=1)]
            #-------------------------------------------------------------#
            #   Among the feature layers returned by add_extras,
            #   layers 1, 3, 5 and 7 are used for box regression and
            #   classification. Their shapes are
            #   (10,10,512), (5,5,256), (3,3,256), (1,1,256).
            #-------------------------------------------------------------#
            for k, v in enumerate(self.extras[1::2], 2):
                loc_layers += [nn.Conv2d(v.out_channels, mbox[k] * 4, kernel_size=3, padding=1)]
                conf_layers += [nn.Conv2d(v.out_channels, mbox[k] * num_classes, kernel_size=3, padding=1)]
        self.loc = nn.ModuleList(loc_layers)
        self.conf = nn.ModuleList(conf_layers)

    def forward(self, x):
        # x is the input image batch, shape [batch_size, 3, 300, 300]
        sources = list()
        loc = list()
        conf = list()

        # Obtain the conv4_3 output, shape (38,38,512);
        # it is rescaled with L2Norm before being used as a source
        for k in range(23):
            x = self.vgg[k](x)
        sources.append(self.L2Norm(x))

        # Obtain the conv7 output, shape (19,19,1024)
        for k in range(23, len(self.vgg)):
            x = self.vgg[k](x)
        sources.append(x)

        # Among the feature layers returned by add_extras,
        # layers 1, 3, 5 and 7 are used for box regression and
        # classification. Their shapes are
        # (10,10,512), (5,5,256), (3,3,256), (1,1,256).
        for k, v in enumerate(self.extras):
            x = F.relu(v(x), inplace=True)
            if k % 2 == 1:
                sources.append(x)

        # Apply the regression and classification heads
        # to the 6 effective feature maps
        for (x, l, c) in zip(sources, self.loc, self.conf):
            loc.append(l(x).permute(0, 2, 3, 1).contiguous())
            conf.append(c(x).permute(0, 2, 3, 1).contiguous())

        # Reshape the predictions so they can be concatenated
        loc = torch.cat([o.view(o.size(0), -1) for o in loc], 1)
        conf = torch.cat([o.view(o.size(0), -1) for o in conf], 1)

        output = (
            loc.view(loc.size(0), -1, 4),                   # [batch_size, num_anchors, 4]
            conf.view(conf.size(0), -1, self.num_classes),  # [batch_size, num_anchors, self.num_classes]
        )
        return output
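The L2Norm module used on the conv4_3 branch is not defined in the text above; the following is a minimal sketch, modeled on the layer introduced in ParseNet and used by standard SSD implementations. It L2-normalizes each spatial position across the channel dimension and rescales it with a learnable per-channel weight initialized to 20.

class L2Norm(nn.Module):
    def __init__(self, n_channels, scale):
        super(L2Norm, self).__init__()
        self.eps = 1e-10
        # One learnable scale per channel, initialized to `scale` (20 for conv4_3)
        self.weight = nn.Parameter(torch.full((n_channels,), float(scale)))

    def forward(self, x):
        # Normalize across channels at every spatial position, then rescale
        norm = x.pow(2).sum(dim=1, keepdim=True).sqrt() + self.eps
        return self.weight.view(1, -1, 1, 1) * x / norm

With these pieces in place, a forward pass maps a [batch_size, 3, 300, 300] input to a [batch_size, 8732, 4] localization tensor and a [batch_size, 8732, num_classes] classification tensor.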
2. Anchor Settings of the SSD Model
At every position of each feature map, SSD predefines a set of anchors with different sizes and aspect ratios, and predicts offsets of the bounding-box locations relative to these anchors. Each anchor corresponds to a fixed position in the input image. Feature maps at different scales have receptive fields of different sizes, so the anchors on each feature map are scaled to a specific size range, making every feature map responsible only for objects within a certain range of sizes.
The anchor widths, heights, and center locations are all normalized to $(0,1)$. For the $l$-th feature map used for detection, with size $m\times n$, a linearly spaced scale is assigned to level $l$ together with $5$ different aspect ratios, plus one additional special scale, giving $6$ anchors at every feature position:
\[\begin{aligned} \text{level index:} & \quad l=1,...,L \\ \text{scale of boxes:} & \quad s_l = s_{\min} + \frac{s_{\max}-s_{\min}}{L-1}(l-1) \\ \text{aspect ratio:} & \quad r \in \{ 1,2,3,1/2,1/3 \} \\ \text{additional scale:} & \quad s_l' = \sqrt{s_ls_{l+1}},r'=1 \\ \text{width:} & \quad w_l^r = s_l\sqrt{r} \\ \text{height:} & \quad h_l^r = s_l/\sqrt{r} \\ \text{center location:} & \quad (x_l^i,y_l^j) = \left(\frac{i+0.5}{m},\frac{j+0.5}{n} \right) \\ \end{aligned}\]For example, when $L=6,s_{\min}=0.2,s_{\max}=0.9$, the $r=1$ anchors have scales $s_l \in \{0.2, 0.34, 0.48, 0.62, 0.76, 0.9\}$, i.e. square boxes whose side length is $s_l$ times the image size.
For each feature position, the model predicts $4$ bounding-box offsets and $c$ class probabilities for each of the $k$ anchors. Thus, for an $m\times n$ feature map, the model output has size $m\times n\times k(c+4)$.
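Plugging in the SSD300 configuration (feature maps of sizes 38, 19, 10, 5, 3, 1 with $k$ = 4, 6, 6, 6, 4, 4 anchors per position) gives the well-known total of 8732 anchors; a quick check:

# Total anchors for SSD300: sum of m*n*k over the 6 prediction feature maps
feature_sizes = [38, 19, 10, 5, 3, 1]
mbox = [4, 6, 6, 6, 4, 4]
print(sum(s * s * k for s, k in zip(feature_sizes, mbox)))  # 8732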
class AnchorBox():
    def __init__(self, input_shape, min_size, max_size=None, aspect_ratios=None, flip=True):
        self.input_shape = input_shape
        self.min_size = min_size
        self.max_size = max_size
        self.aspect_ratios = []
        for ar in aspect_ratios:
            self.aspect_ratios.append(ar)
            self.aspect_ratios.append(1.0 / ar)

    def call(self, layer_shape, mask=None):
        # Height and width of the incoming feature map, e.g. 38x38
        layer_height = layer_shape[0]
        layer_width = layer_shape[1]
        # Height and width of the input image, e.g. 300x300
        img_height = self.input_shape[0]
        img_width = self.input_shape[1]

        box_widths = []
        box_heights = []
        # self.aspect_ratios is typically [1, 1, 2, 1/2] or [1, 1, 2, 1/2, 3, 1/3]
        for ar in self.aspect_ratios:
            # First add a small square
            if ar == 1 and len(box_widths) == 0:
                box_widths.append(self.min_size)
                box_heights.append(self.min_size)
            # Then add a larger square
            elif ar == 1 and len(box_widths) > 0:
                box_widths.append(np.sqrt(self.min_size * self.max_size))
                box_heights.append(np.sqrt(self.min_size * self.max_size))
            # Then add the rectangles
            elif ar != 1:
                box_widths.append(self.min_size * np.sqrt(ar))
                box_heights.append(self.min_size / np.sqrt(ar))

        # Half widths and half heights of all anchors
        box_widths = 0.5 * np.array(box_widths)
        box_heights = 0.5 * np.array(box_heights)

        # Stride of this feature map relative to the input image
        step_x = img_width / layer_width
        step_y = img_height / layer_height

        # Generate the grid of anchor centers
        linx = np.linspace(0.5 * step_x, img_width - 0.5 * step_x,
                           layer_width)
        liny = np.linspace(0.5 * step_y, img_height - 0.5 * step_y,
                           layer_height)
        centers_x, centers_y = np.meshgrid(linx, liny)
        centers_x = centers_x.reshape(-1, 1)
        centers_y = centers_y.reshape(-1, 1)

        # Each anchor needs two (centers_x, centers_y) pairs:
        # the first gives the top-left corner, the second the bottom-right corner
        num_anchors_ = len(self.aspect_ratios)
        anchor_boxes = np.concatenate((centers_x, centers_y), axis=1)
        anchor_boxes = np.tile(anchor_boxes, (1, 2 * num_anchors_))
        # Compute the top-left and bottom-right corners of the anchors
        anchor_boxes[:, ::4] -= box_widths
        anchor_boxes[:, 1::4] -= box_heights
        anchor_boxes[:, 2::4] += box_widths
        anchor_boxes[:, 3::4] += box_heights

        # Normalize the anchors to [0, 1]
        anchor_boxes[:, ::2] /= img_width
        anchor_boxes[:, 1::2] /= img_height
        anchor_boxes = anchor_boxes.reshape(-1, 4)
        anchor_boxes = np.minimum(np.maximum(anchor_boxes, 0.0), 1.0)
        return anchor_boxes
# Compute the sizes of the shared feature maps
def get_vgg_output_length(height, width):
    filter_sizes = [3, 3, 3, 3, 3, 3, 3, 3]
    padding = [1, 1, 1, 1, 1, 1, 0, 0]
    stride = [2, 2, 2, 2, 2, 2, 1, 1]
    feature_heights = []
    feature_widths = []
    for i in range(len(filter_sizes)):
        height = (height + 2 * padding[i] - filter_sizes[i]) // stride[i] + 1
        width = (width + 2 * padding[i] - filter_sizes[i]) // stride[i] + 1
        feature_heights.append(height)
        feature_widths.append(width)
    return np.array(feature_heights)[-6:], np.array(feature_widths)[-6:]

def get_anchors(input_shape=[300, 300], anchors_size=[30, 60, 111, 162, 213, 264, 315], backbone='vgg'):
    feature_heights, feature_widths = get_vgg_output_length(input_shape[0], input_shape[1])
    aspect_ratios = [[1, 2], [1, 2, 3], [1, 2, 3], [1, 2, 3], [1, 2], [1, 2]]
    anchors = []
    for i in range(len(feature_heights)):
        anchor_boxes = AnchorBox(input_shape, anchors_size[i], max_size=anchors_size[i + 1],
                                 aspect_ratios=aspect_ratios[i]).call([feature_heights[i], feature_widths[i]])
        anchors.append(anchor_boxes)
    anchors = np.concatenate(anchors, axis=0)
    return anchors
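A quick sanity check of the generator (a sketch under the default settings above): for a 300×300 input the six feature maps are 38, 19, 10, 5, 3 and 1 cells wide, and the full pyramid yields 8732 anchors:

heights, widths = get_vgg_output_length(300, 300)
print(heights)  # [38 19 10  5  3  1]

anchors = get_anchors()
print(anchors.shape)  # (8732, 4): normalized (x_min, y_min, x_max, y_max)

# A single 38x38 layer with aspect_ratios=[1, 2] contributes 38*38*4 = 5776 anchors
print(AnchorBox([300, 300], 30, max_size=60, aspect_ratios=[1, 2]).call([38, 38]).shape)  # (5776, 4)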
3. The Loss Function of the SSD Model
The SSD loss combines a localization loss for the anchor box positions and a classification loss for the class probabilities:
\[\begin{aligned} \mathcal{L} & =\frac{1}{N}(\alpha\mathcal{L}_{\mathrm{loc}}+\mathcal{L}_{\mathrm{cls}} ) \end{aligned}\]where $N$ is the number of matched boxes and $\alpha$ is a weighting coefficient that balances the two losses.
Given an anchor's preset bounding-box coordinates $p=(p_x,p_y,p_w,p_h)$ and its ground-truth label $g=(g_x,g_y,g_w,g_h)$, where each tuple denotes the box center together with its width and height, bounding-box regression learns a function $d(\cdot)$ that applies a scale-invariant transform to the center location and a log-scale transform to the width and height:
\[\begin{aligned} \hat{g}_x &= p_wd_x(p) + p_x \\ \hat{g}_y &= p_hd_y(p) + p_y \\ \hat{g}_w &= p_w \exp(d_w(p)) \\ \hat{g}_h &= p_h \exp(d_h(p)) \end{aligned}\]With this transformation, the regressor outputs \(d_i(p),i\in \{x,y,w,h\}\) can take any value in $(-\infty,+\infty)$. The regression targets are:
\[\begin{aligned} t_x &= (g_x-p_x)/p_w \\ t_y &= (g_y-p_y)/p_h \\ t_w &= \log (g_w/p_w) \\ t_h &= \log (g_h/p_h) \\ \end{aligned}\]The bounding-box regression loss uses the smooth L1 loss, a robust loss function that is insensitive to outliers:
\[L_1^{smooth}(x) = \begin{cases} 0.5x^2, & |x| < 1 \\ |x| - 0.5, & |x| \geq 1 \end{cases}\]The bounding-box regression loss is then:
\[\begin{aligned} \mathcal{L}_{\mathrm{loc}} = \sum_{i,j} \sum_{m\in \{x,y,w,h\}} 1_{ij}^{match} L_1^{smooth}(d_m^i-t_m^j) \end{aligned}\]The classification loss is a multi-class softmax loss:
\[\begin{aligned} \mathcal{L}_{\mathrm{cls}} = -\sum_{i\in pos} 1_{ij}^k \log(\text{softmax}(c_i^k)) -\sum_{i\in neg} \log(\text{softmax}(c_i^0)) \end{aligned}\]where $pos$ is the set of matched boxes and $neg$ is the set of negative samples. SSD selects negatives that are easily misclassified via hard negative mining: all unmatched anchors are sorted by their objectness confidence score and the top-scoring ones are kept for training, so that the $neg:pos$ ratio is at most $3:1$.
class MultiboxLoss(nn.Module):
    def __init__(self, num_classes, alpha=1.0, neg_pos_ratio=3.0,
                 background_label_id=0, negatives_for_hard=100.0):
        super(MultiboxLoss, self).__init__()
        self.num_classes = num_classes
        self.alpha = alpha
        self.neg_pos_ratio = neg_pos_ratio
        self.background_label_id = background_label_id
        self.negatives_for_hard = torch.FloatTensor([negatives_for_hard])[0]

    def _l1_smooth_loss(self, y_true, y_pred):
        abs_loss = torch.abs(y_true - y_pred)
        sq_loss = 0.5 * (y_true - y_pred) ** 2
        l1_loss = torch.where(abs_loss < 1.0, sq_loss, abs_loss - 0.5)
        return torch.sum(l1_loss, -1)

    def _softmax_loss(self, y_true, y_pred):
        y_pred = torch.clamp(y_pred, min=1e-7)
        softmax_loss = -torch.sum(y_true * torch.log(y_pred),
                                  axis=-1)
        return softmax_loss

    def forward(self, y_true, y_pred):
        # y_true: batch_size, 8732, 4 + self.num_classes + 1
        # y_pred: batch_size, 8732, 4 + self.num_classes
        num_boxes = y_true.size()[1]
        # y_pred is the (loc, conf) tuple returned by SSD300;
        # apply softmax to the class logits before concatenating
        y_pred = torch.cat([y_pred[0], nn.Softmax(-1)(y_pred[1])], dim=-1)

        # Classification loss: batch_size,8732,self.num_classes -> batch_size,8732
        conf_loss = self._softmax_loss(y_true[:, :, 4:-1], y_pred[:, :, 4:])
        # Regression loss: batch_size,8732,4 -> batch_size,8732
        loc_loss = self._l1_smooth_loss(y_true[:, :, :4],
                                        y_pred[:, :, :4])

        # Gather the losses of all matched boxes
        pos_loc_loss = torch.sum(loc_loss * y_true[:, :, -1],
                                 axis=1)
        pos_conf_loss = torch.sum(conf_loss * y_true[:, :, -1],
                                  axis=1)

        # Number of positives and negatives per image, [batch_size,]
        num_pos = torch.sum(y_true[:, :, -1], axis=-1)
        num_neg = torch.min(self.neg_pos_ratio * num_pos, num_boxes - num_pos)
        # If no image in the batch has any negatives,
        # fall back to selecting 100 anchors as negatives
        pos_num_neg_mask = num_neg > 0
        has_min = torch.sum(pos_num_neg_mask)
        num_neg_batch = torch.sum(num_neg) if has_min > 0 else self.negatives_for_hard

        # Sum of the non-background class probabilities, used as the objectness score
        confs_start = 4 + self.background_label_id + 1
        confs_end = confs_start + self.num_classes - 1
        max_confs = torch.sum(y_pred[:, :, confs_start:confs_end], dim=2)

        # Over the whole batch, select the num_neg_batch hardest unmatched
        # anchors as negatives for the classification loss
        max_confs = (max_confs * (1 - y_true[:, :, -1])).view([-1])
        _, indices = torch.topk(max_confs, k=int(num_neg_batch))
        neg_conf_loss = torch.gather(conf_loss.view([-1]), 0, indices)

        # Total loss
        num_pos = torch.where(num_pos != 0, num_pos, torch.ones_like(num_pos))
        total_loss = torch.sum(pos_conf_loss) + torch.sum(neg_conf_loss) + torch.sum(self.alpha * pos_loc_loss)
        total_loss = total_loss / torch.sum(num_pos)
        return total_loss
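As a shape-level smoke test (a sketch with random tensors, not real training data; it assumes 21 classes and the 8732-anchor layout above):

criterion = MultiboxLoss(num_classes=21)

loc_pred = torch.randn(2, 8732, 4)
conf_pred = torch.randn(2, 8732, 21)

# y_true layout: 4 offsets | background flag + 20 one-hot classes | positive flag
y_true = torch.zeros(2, 8732, 4 + 21 + 1)
y_true[:, :, 4] = 1.0        # every anchor is background by default
y_true[:, :10, 4] = 0.0      # pretend the first 10 anchors are matched
y_true[:, :10, 5] = 1.0      # ... to class 1
y_true[:, :10, -1] = 1.0     # positive flag

loss = criterion(y_true, (loc_pred, conf_pred))
print(loss.item())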
4. Dataset Preparation for the SSD Model
In object detection, datasets usually provide ground-truth box annotations for each image. When an image is fed into the SSD model it is resized, so the corresponding annotation boxes must be adjusted accordingly. Moreover, the learning targets of the SSD model are the box offsets and class probabilities of the preset anchors, so the ground-truth box annotations have to be converted into anchor-based annotations when the dataset is loaded.
from PIL import Image
from torch.utils.data import Dataset
# cvtColor and preprocess_input are helpers from the referenced ssd-pytorch
# repository (RGB conversion and input normalization); not defined in this article.

class SSDDataset(Dataset):
    def __init__(self, annotation_lines, input_shape, anchors, batch_size, num_classes, overlap_threshold=0.5):
        super(SSDDataset, self).__init__()
        self.annotation_lines = annotation_lines
        self.length = len(self.annotation_lines)
        self.input_shape = input_shape
        self.anchors = anchors
        self.num_anchors = len(anchors)
        self.batch_size = batch_size
        self.num_classes = num_classes
        self.overlap_threshold = overlap_threshold

    def __len__(self):
        return self.length

    def __getitem__(self, index):
        index = index % self.length
        image, box = self.get_random_data(self.annotation_lines[index], self.input_shape)
        image_data = np.transpose(preprocess_input(np.array(image, dtype=np.float32)), (2, 0, 1))
        if len(box) != 0:
            boxes = np.array(box[:, :4], dtype=np.float32)
            # Normalize the box coordinates to [0, 1]
            boxes[:, [0, 2]] = boxes[:, [0, 2]] / self.input_shape[1]
            boxes[:, [1, 3]] = boxes[:, [1, 3]] / self.input_shape[0]
            # One-hot encode the ground-truth classes
            one_hot_label = np.eye(self.num_classes - 1)[np.array(box[:, 4], np.int32)]
            box = np.concatenate([boxes, one_hot_label], axis=-1)
        # Convert the ground-truth box annotations into anchor annotations
        box = self.assign_boxes(box)
        return np.array(image_data, np.float32), np.array(box, np.float32)
    def get_random_data(self, annotation_line, input_shape):
        line = annotation_line.split()
        # Read the image and convert it to RGB
        image = Image.open(line[0])
        image = cvtColor(image)
        # Original image size and target size
        iw, ih = image.size
        h, w = input_shape
        # Parse the ground-truth boxes
        box = np.array([np.array(list(map(int, box.split(',')))) for box in line[1:]])

        scale = min(w / iw, h / ih)
        nw = int(iw * scale)
        nh = int(ih * scale)
        dx = (w - nw) // 2
        dy = (h - nh) // 2

        # Resize with unchanged aspect ratio and pad the borders with gray
        image = image.resize((nw, nh), Image.BICUBIC)
        new_image = Image.new('RGB', (w, h), (128, 128, 128))
        new_image.paste(image, (dx, dy))
        image_data = np.array(new_image, np.float32)

        # Adjust the ground-truth boxes accordingly
        if len(box) > 0:
            np.random.shuffle(box)
            box[:, [0, 2]] = box[:, [0, 2]] * nw / iw + dx
            box[:, [1, 3]] = box[:, [1, 3]] * nh / ih + dy
            box[:, 0:2][box[:, 0:2] < 0] = 0
            box[:, 2][box[:, 2] > w] = w
            box[:, 3][box[:, 3] > h] = h
            box_w = box[:, 2] - box[:, 0]
            box_h = box[:, 3] - box[:, 1]
            box = box[np.logical_and(box_w > 1, box_h > 1)]  # discard invalid boxes
        return image_data, box
    def iou(self, box):
        # Compute the IoU between one ground-truth box and all anchors,
        # measuring how well each anchor overlaps with the ground truth
        inter_upleft = np.maximum(self.anchors[:, :2], box[:2])
        inter_botright = np.minimum(self.anchors[:, 2:4], box[2:])

        inter_wh = inter_botright - inter_upleft
        inter_wh = np.maximum(inter_wh, 0)
        inter = inter_wh[:, 0] * inter_wh[:, 1]
        # Area of the ground-truth box
        area_true = (box[2] - box[0]) * (box[3] - box[1])
        # Areas of the anchors
        area_gt = (self.anchors[:, 2] - self.anchors[:, 0]) * (self.anchors[:, 3] - self.anchors[:, 1])
        # IoU = intersection / union
        union = area_true + area_gt - inter
        iou = inter / union
        return iou
    def encode_box(self, box, return_iou=True, variances=[0.1, 0.1, 0.2, 0.2]):
        # Overlap between the current ground-truth box and the anchors
        #   iou         [self.num_anchors]
        #   encoded_box [self.num_anchors, 5]
        iou = self.iou(box)
        encoded_box = np.zeros((self.num_anchors, 4 + return_iou))

        # Find the anchors with high overlap for this ground-truth box;
        # these anchors are responsible for predicting it
        assign_mask = iou > self.overlap_threshold

        # If no anchor overlaps more than self.overlap_threshold,
        # take the anchor with the largest IoU as the positive
        if not assign_mask.any():
            assign_mask[iou.argmax()] = True

        # Store the IoU values
        if return_iou:
            encoded_box[:, -1][assign_mask] = iou[assign_mask]

        # Select the assigned anchors
        assigned_anchors = self.anchors[assign_mask]

        # Inverse encoding: convert the ground-truth box into
        # the format of SSD's regression outputs.
        # First compute the center and size of the ground-truth box
        box_center = 0.5 * (box[:2] + box[2:])
        box_wh = box[2:] - box[:2]
        # Then compute the centers and sizes of the assigned anchors
        assigned_anchors_center = (assigned_anchors[:, 0:2] + assigned_anchors[:, 2:4]) * 0.5
        assigned_anchors_wh = (assigned_anchors[:, 2:4] - assigned_anchors[:, 0:2])

        # Encode the center targets, then the width/height targets.
        # The variances rescale the targets; they default to [0.1, 0.1, 0.2, 0.2]
        encoded_box[:, :2][assign_mask] = box_center - assigned_anchors_center
        encoded_box[:, :2][assign_mask] /= assigned_anchors_wh
        encoded_box[:, :2][assign_mask] /= np.array(variances)[:2]

        encoded_box[:, 2:4][assign_mask] = np.log(box_wh / assigned_anchors_wh)
        encoded_box[:, 2:4][assign_mask] /= np.array(variances)[2:4]
        return encoded_box.ravel()
    def assign_boxes(self, boxes):
        # assignment has 3 parts:
        #   :4    the regression targets the network should predict
        #   4:-1  the class of each anchor, background by default
        #   -1    whether the anchor contains an object
        assignment = np.zeros((self.num_anchors, 4 + self.num_classes + 1))
        assignment[:, 4] = 1.0
        if len(boxes) == 0:
            return assignment

        # Compute the IoU of every ground-truth box against the anchors
        encoded_boxes = np.apply_along_axis(self.encode_box, 1, boxes[:, :4])
        # After reshaping, encoded_boxes has shape
        # [num_true_box, num_anchors, 4 + 1]: 4 encoded offsets plus 1 IoU
        encoded_boxes = encoded_boxes.reshape(-1, self.num_anchors, 5)

        # For each anchor, find the ground-truth box with the largest IoU, [num_anchors]
        best_iou = encoded_boxes[:, :, -1].max(axis=0)
        best_iou_idx = encoded_boxes[:, :, -1].argmax(axis=0)
        best_iou_mask = best_iou > 0
        best_iou_idx = best_iou_idx[best_iou_mask]

        # Number of anchors matched to some ground-truth box
        assign_num = len(best_iou_idx)

        # Extract the encoded ground-truth boxes
        encoded_boxes = encoded_boxes[:, best_iou_mask, :]
        # Fill in the regression targets of the matched anchors
        assignment[:, :4][best_iou_mask] = encoded_boxes[best_iou_idx, np.arange(assign_num), :4]
        # Column 4 is the background probability: set it to 0,
        # since these anchors are matched to an object
        assignment[:, 4][best_iou_mask] = 0
        assignment[:, 5:-1][best_iou_mask] = boxes[best_iou_idx, 4:]
        # The last column flags whether the anchor is matched to an object
        assignment[:, -1][best_iou_mask] = 1
        return assignment
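A usage sketch (hypothetical annotation format: each line is an image path followed by space-separated `x_min,y_min,x_max,y_max,class_id` tuples; the path below is a placeholder):

lines = ["path/to/image.jpg 48,240,195,371,11 8,12,352,498,14"]
anchors = get_anchors()
dataset = SSDDataset(lines, input_shape=[300, 300], anchors=anchors,
                     batch_size=1, num_classes=21)
image, target = dataset[0]
print(image.shape)   # (3, 300, 300)
print(target.shape)  # (8732, 26) = 4 offsets + 21 class columns + 1 positive flag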
A complete PyTorch implementation of SSD is available in ssd-pytorch.