1. SSD模型的结构
def add_extras(in_channels, backbone_name):
    """Build the extra convolution layers that follow the backbone.

    For the ``'vgg'`` backbone, four blocks are created; each block is a 1x1
    convolution followed by a 3x3 convolution. The shape comments below
    describe a 19x19x1024 input (H, W, C).

    Args:
        in_channels: Number of channels of the feature map fed to the first
            extra layer.
        backbone_name: Name of the backbone. Only ``'vgg'`` adds layers; any
            other value yields an empty ``nn.ModuleList``.

    Returns:
        ``nn.ModuleList`` holding the extra layers in forward order.
    """
    layers = []
    if backbone_name == 'vgg':
        # Block 6: 19,19,1024 -> 19,19,256 -> 10,10,512
        layers += [nn.Conv2d(in_channels, 256, kernel_size=1, stride=1)]
        layers += [nn.Conv2d(256, 512, kernel_size=3, stride=2, padding=1)]
        # Block 7: 10,10,512 -> 10,10,128 -> 5,5,256
        layers += [nn.Conv2d(512, 128, kernel_size=1, stride=1)]
        layers += [nn.Conv2d(128, 256, kernel_size=3, stride=2, padding=1)]
        # Block 8: 5,5,256 -> 5,5,128 -> 3,3,256
        layers += [nn.Conv2d(256, 128, kernel_size=1, stride=1)]
        layers += [nn.Conv2d(128, 256, kernel_size=3, stride=1)]
        # Block 9: 3,3,256 -> 3,3,128 -> 1,1,256
        layers += [nn.Conv2d(256, 128, kernel_size=1, stride=1)]
        layers += [nn.Conv2d(128, 256, kernel_size=3, stride=1)]
    return nn.ModuleList(layers)
class SSD300(nn.Module):
    """SSD detector with a VGG backbone and six prediction feature maps.

    Two feature maps come from the backbone and four from the extra layers.
    Each one gets a 3x3 localisation head and a 3x3 classification head.
    """

    def __init__(self, num_classes, backbone_name = "vgg", pretrained = False):
        """Create the backbone, the extra layers and the prediction heads.

        Args:
            num_classes: Number of class scores predicted per anchor.
            backbone_name: Backbone identifier; only ``"vgg"`` is supported.
            pretrained: Forwarded to ``add_vgg``.

        Raises:
            ValueError: If ``backbone_name`` is not ``"vgg"``.
        """
        super(SSD300, self).__init__()
        self.num_classes = num_classes
        if backbone_name != "vgg":
            # Without this guard the heads below would be built from
            # undefined layers and fail with an unrelated error.
            raise ValueError("Unsupported backbone: {!r}".format(backbone_name))

        self.vgg = add_vgg(pretrained)
        self.extras = add_extras(1024, backbone_name)
        self.L2Norm = L2Norm(512, 20)
        # Number of anchors per location for each of the six feature maps.
        mbox = [4, 6, 6, 6, 4, 4]

        loc_layers = []
        conf_layers = []
        # Backbone layers 21 and -2 feed the prediction heads. Per the
        # original notes these are conv4_3 (38,38,512) and conv7 (19,19,1024).
        backbone_source = [21, -2]
        for k, v in enumerate(backbone_source):
            loc_layers += [nn.Conv2d(self.vgg[v].out_channels, mbox[k] * 4, kernel_size = 3, padding = 1)]
            conf_layers += [nn.Conv2d(self.vgg[v].out_channels, mbox[k] * num_classes, kernel_size = 3, padding = 1)]
        # Extra layers 1, 3, 5 and 7 (the second conv of each block) feed the
        # heads as well: (10,10,512), (5,5,256), (3,3,256), (1,1,256).
        for k, v in enumerate(self.extras[1::2], 2):
            loc_layers += [nn.Conv2d(v.out_channels, mbox[k] * 4, kernel_size = 3, padding = 1)]
            conf_layers += [nn.Conv2d(v.out_channels, mbox[k] * num_classes, kernel_size = 3, padding = 1)]

        self.loc = nn.ModuleList(loc_layers)
        self.conf = nn.ModuleList(conf_layers)

    def forward(self, x):
        """Run the detector.

        Args:
            x: Input image batch (the original notes describe a 300x300x3
                image).

        Returns:
            Tuple ``(loc, conf)`` where ``loc`` has shape
            ``[batch_size, num_anchors, 4]`` and ``conf`` has shape
            ``[batch_size, num_anchors, num_classes]``.
        """
        sources = []
        loc = []
        conf = []

        # Backbone up to conv4_3 (38,38,512); this map is L2-normalised
        # before it is used for prediction.
        for k in range(23):
            x = self.vgg[k](x)
        sources.append(self.L2Norm(x))

        # Rest of the backbone up to conv7 (19,19,1024).
        for k in range(23, len(self.vgg)):
            x = self.vgg[k](x)
        sources.append(x)

        # Extra layers: keep the output of every second layer (1, 3, 5, 7).
        for k, v in enumerate(self.extras):
            x = F.relu(v(x), inplace=True)
            if k % 2 == 1:
                sources.append(x)

        # Apply the heads to the six feature maps; move channels last so the
        # per-anchor values are contiguous after flattening.
        for feature, loc_head, conf_head in zip(sources, self.loc, self.conf):
            loc.append(loc_head(feature).permute(0, 2, 3, 1).contiguous())
            conf.append(conf_head(feature).permute(0, 2, 3, 1).contiguous())

        # Flatten each map and concatenate along the anchor dimension.
        loc = torch.cat([o.view(o.size(0), -1) for o in loc], 1)
        conf = torch.cat([o.view(o.size(0), -1) for o in conf], 1)

        output = (
            loc.view(loc.size(0), -1, 4),                    # [batch_size, num_anchors, 4]
            conf.view(conf.size(0), -1, self.num_classes),   # [batch_size, num_anchors, num_classes]
        )
        return output
2. SSD模型的anchor设置
anchor的宽度、高度以及中心位置被归一化到$(0,1)$之间。对于第$l$个用于检测目标的特征映射,其尺寸为$m\times n$,为其指定一个与层级$l$对应的线性尺度$s_l$和$5$个不同的长宽比,此外还有一个长宽比为$1$的附加尺度$s_l'$。因此在每个特征位置上共设置$6$个anchor(在下面的实现中,部分特征层只使用长宽比$\{1,2,1/2\}$,此时每个位置设置$4$个anchor)。
\[\begin{aligned} \text{level index:} & \quad l=1,...,L \\ \text{scale of boxes:} & \quad s_l = s_{\min} + \frac{s_{\max}-s_{\min}}{L-1}(l-1) \\ \text{aspect ratio:} & \quad r \in \{ 1,2,3,1/2,1/3 \} \\ \text{additional scale:} & \quad s_l' = \sqrt{s_ls_{l+1}},r'=1 \\ \text{width:} & \quad w_l^r = s_l\sqrt{r} \\ \text{height:} & \quad h_l^r = s_l/\sqrt{r} \\ \text{center location:} & \quad (x_l^i,y_l^j) = \left(\frac{i+0.5}{m},\frac{j+0.5}{n} \right) \\ \end{aligned}\]比如当$L=6,s_{\min}=0.2,s_{\max}=0.9$时,$r=1$的anchor设置如下:
对于每一个特征位置,模型对$k$个anchor分别预测$4$个边界框位置偏移量与$c$个类别概率。则对于$m\times n$的特征图,模型输出特征尺寸为$m\times n\times k(c+4)$。
class AnchorBox():
    """Generate the anchor (prior) boxes of one feature map."""

    def __init__(self, input_shape, min_size, max_size=None, aspect_ratios=None, flip=True):
        """Store the anchor configuration.

        Args:
            input_shape: ``(height, width)`` of the network input image.
            min_size: Side length, in pixels, of the small square anchor.
            max_size: Used with ``min_size`` to size the large square anchor
                (``sqrt(min_size * max_size)``).
            aspect_ratios: Iterable of base aspect ratios. Each ratio ``ar``
                is stored together with its reciprocal, so ``[1, 2]`` becomes
                ``[1, 1, 2, 1/2]``. Must be provided; the ``None`` default is
                not iterable.
            flip: Not used by this implementation.
        """
        self.input_shape = input_shape
        self.min_size = min_size
        self.max_size = max_size
        self.aspect_ratios = []
        for ar in aspect_ratios:
            self.aspect_ratios.append(ar)
            self.aspect_ratios.append(1.0 / ar)

    def call(self, layer_shape, mask=None):
        """Return the anchors for a feature map of size ``layer_shape``.

        Args:
            layer_shape: ``(height, width)`` of the feature map, e.g. 38x38.
            mask: Not used.

        Returns:
            Array of shape ``(height * width * num_anchors, 4)`` with rows
            ``(x_min, y_min, x_max, y_max)``, normalised by the image size
            and clipped to ``[0, 1]``.
        """
        layer_height = layer_shape[0]
        layer_width = layer_shape[1]
        img_height = self.input_shape[0]
        img_width = self.input_shape[1]

        box_widths = []
        box_heights = []
        # self.aspect_ratios is typically [1, 1, 2, 1/2] or
        # [1, 1, 2, 1/2, 3, 1/3].
        for ar in self.aspect_ratios:
            if ar == 1 and not box_widths:
                # First ratio of 1: the small square.
                box_widths.append(self.min_size)
                box_heights.append(self.min_size)
            elif ar == 1 and box_widths:
                # Second ratio of 1: the larger square.
                box_widths.append(np.sqrt(self.min_size * self.max_size))
                box_heights.append(np.sqrt(self.min_size * self.max_size))
            elif ar != 1:
                # Rectangles with the area of the small square.
                box_widths.append(self.min_size * np.sqrt(ar))
                box_heights.append(self.min_size / np.sqrt(ar))

        # Half extents, so corners can be computed from the centre.
        box_widths = 0.5 * np.array(box_widths)
        box_heights = 0.5 * np.array(box_heights)

        # Stride of this feature map in image pixels.
        step_x = img_width / layer_width
        step_y = img_height / layer_height

        # Centres of the grid cells.
        linx = np.linspace(0.5 * step_x, img_width - 0.5 * step_x, layer_width)
        liny = np.linspace(0.5 * step_y, img_height - 0.5 * step_y, layer_height)
        centers_x, centers_y = np.meshgrid(linx, liny)
        centers_x = centers_x.reshape(-1, 1)
        centers_y = centers_y.reshape(-1, 1)

        # Every anchor needs the centre twice: once for the top-left corner
        # and once for the bottom-right corner.
        num_anchors_ = len(self.aspect_ratios)
        anchor_boxes = np.concatenate((centers_x, centers_y), axis=1)
        anchor_boxes = np.tile(anchor_boxes, (1, 2 * num_anchors_))

        # Turn the centres into top-left and bottom-right corners.
        anchor_boxes[:, ::4] -= box_widths
        anchor_boxes[:, 1::4] -= box_heights
        anchor_boxes[:, 2::4] += box_widths
        anchor_boxes[:, 3::4] += box_heights

        # Normalise by the image size and clip to the image.
        anchor_boxes[:, ::2] /= img_width
        anchor_boxes[:, 1::2] /= img_height
        anchor_boxes = anchor_boxes.reshape(-1, 4)
        anchor_boxes = np.minimum(np.maximum(anchor_boxes, 0.0), 1.0)
        return anchor_boxes
# Computes the spatial sizes of the feature maps used for prediction.
def get_vgg_output_length(height, width):
    """Return the sizes of the last six feature maps for a given input size.

    The input size is pushed through eight size-reducing stages described by
    ``filter_sizes``, ``padding`` and ``stride`` using the standard
    convolution output-size formula.

    Args:
        height: Input image height in pixels.
        width: Input image width in pixels.

    Returns:
        Tuple ``(feature_heights, feature_widths)`` of NumPy arrays holding
        the last six heights and widths, e.g. ``[38, 19, 10, 5, 3, 1]`` for a
        300x300 input.
    """
    filter_sizes = [3, 3, 3, 3, 3, 3, 3, 3]
    padding = [1, 1, 1, 1, 1, 1, 0, 0]
    stride = [2, 2, 2, 2, 2, 2, 1, 1]
    feature_heights = []
    feature_widths = []
    for kernel, pad, step in zip(filter_sizes, padding, stride):
        height = (height + 2 * pad - kernel) // step + 1
        width = (width + 2 * pad - kernel) // step + 1
        feature_heights.append(height)
        feature_widths.append(width)
    return np.array(feature_heights)[-6:], np.array(feature_widths)[-6:]
def get_anchors(input_shape = [300,300], anchors_size = [30, 60, 111, 162, 213, 264, 315], backbone = 'vgg'):
    """Build the anchors of all six prediction feature maps.

    Args:
        input_shape: ``[height, width]`` of the network input.
        anchors_size: Seven anchor sizes in pixels; feature map ``i`` uses
            ``anchors_size[i]`` as ``min_size`` and ``anchors_size[i + 1]``
            as ``max_size``.
        backbone: Not used by this implementation; feature-map sizes are
            always computed with ``get_vgg_output_length``.

    Returns:
        Array of shape ``(total_anchors, 4)``: the per-map results of
        ``AnchorBox.call`` concatenated in feature-map order.
    """
    feature_heights, feature_widths = get_vgg_output_length(input_shape[0], input_shape[1])
    # Base aspect ratios per feature map; AnchorBox adds the reciprocals.
    aspect_ratios = [[1, 2], [1, 2, 3], [1, 2, 3], [1, 2, 3], [1, 2], [1, 2]]

    anchors = []
    for i, (layer_height, layer_width) in enumerate(zip(feature_heights, feature_widths)):
        anchor_boxes = AnchorBox(input_shape, anchors_size[i], max_size = anchors_size[i+1],
                                 aspect_ratios = aspect_ratios[i]).call([layer_height, layer_width])
        anchors.append(anchor_boxes)

    anchors = np.concatenate(anchors, axis=0)
    return anchors
3. SSD模型的损失函数
\[\begin{aligned} \mathcal{L} & =\frac{1}{N}(\alpha\mathcal{L}_{\mathrm{loc}}+\mathcal{L}_{\mathrm{cls}} ) \end{aligned}\]其中$N$是匹配边界框的数量,$\alpha$是平衡损失的加权系数。
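边界框回归不直接预测绝对坐标,而是预测相对于anchor的偏移量。记anchor为$p=(p_x,p_y,p_w,p_h)$(中心坐标与宽高),回归器的输出为$d(p)$,则预测框$\hat{g}$由下式得到:\[\begin{aligned} \hat{g}_x &= p_wd_x(p) + p_x \\ \hat{g}_y &= p_hd_y(p) + p_y \\ \hat{g}_w &= p_w \exp(d_w(p)) \\ \hat{g}_h &= p_h \exp(d_h(p)) \end{aligned}\]通过采用上述变换,回归器的输出\(d_i(p),i\in \{x,y,w,h\}\)取值范围为$(-\infty,+\infty)$。记ground truth框为$g=(g_x,g_y,g_w,g_h)$,回归器学习的目标为: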
\[\begin{aligned} t_x &= (g_x-p_x)/p_w \\ t_y &= (g_y-p_y)/p_h \\ t_w &= \log (g_w/p_w) \\ t_h &= \log (g_h/p_h) \\ \end{aligned}\]边界框回归损失采用平滑L1损失,这是一种鲁棒的损失函数,对离群点不敏感:
\[L_1^{smooth}(x) = \begin{cases} 0.5x^2, & |x| < 1 \\ |x| - 0.5, & |x| \geq 1 \end{cases}\]则边界框回归损失表示为:
\[\begin{aligned} \mathcal{L}_{\mathrm{loc}} = \sum_{i,j} \sum_{m\in \{x,y,w,h\}} 1_{ij}^{match} L_1^{smooth}(d_m^i-t_m^j) \end{aligned}\]分类损失采用多类别softmax损失:
\[\begin{aligned} \mathcal{L}_{\mathrm{cls}} = -\sum_{i\in pos} 1_{ij}^k \log(\text{softmax}(c_i^k)) -\sum_{i\in neg} \log(\text{softmax}(c_i^0)) \end{aligned}\]其中$pos$是匹配边界框集合,$neg$是负样本集合。SSD使用难负样本挖掘(hard negative mining)选择容易被误分类的负样本:把所有未匹配的anchor按照目标置信度得分(非背景类别的预测概率之和)从高到低排序,选择得分最高、即最容易被误判为目标的anchor作为负样本参与训练,使得$neg:pos$最多为$3:1$。
class MultiboxLoss(nn.Module):
    """SSD loss: smooth-L1 localisation loss plus softmax classification
    loss with hard negative mining."""

    def __init__(self, num_classes, alpha=1.0, neg_pos_ratio=3.0,
                 background_label_id=0, negatives_for_hard=100.0):
        """Store the loss configuration.

        Args:
            num_classes: Number of class scores per anchor.
            alpha: Weight of the localisation loss.
            neg_pos_ratio: Maximum number of negatives per positive anchor.
            background_label_id: Index of the background class within the
                class scores.
            negatives_for_hard: Number of negatives used when the batch
                yields no negatives through the ratio rule.
        """
        # Required so the module can be invoked as ``loss(y_true, y_pred)``.
        super(MultiboxLoss, self).__init__()
        self.num_classes = num_classes
        self.alpha = alpha
        self.neg_pos_ratio = neg_pos_ratio
        self.background_label_id = background_label_id
        self.negatives_for_hard = torch.FloatTensor([negatives_for_hard])[0]

    def _l1_smooth_loss(self, y_true, y_pred):
        """Return the smooth-L1 loss summed over the last dimension."""
        abs_loss = torch.abs(y_true - y_pred)
        sq_loss = 0.5 * (y_true - y_pred)**2
        l1_loss = torch.where(abs_loss < 1.0, sq_loss, abs_loss - 0.5)
        return torch.sum(l1_loss, -1)

    def _softmax_loss(self, y_true, y_pred):
        """Return the cross-entropy between one-hot targets and
        probabilities, summed over the last dimension."""
        # Clamp to avoid log(0).
        y_pred = torch.clamp(y_pred, min = 1e-7)
        softmax_loss = -torch.sum(y_true * torch.log(y_pred), dim=-1)
        return softmax_loss

    def forward(self, y_true, y_pred):
        """Compute the total loss for a batch.

        Args:
            y_true: Tensor of shape
                ``[batch_size, num_anchors, 4 + num_classes + 1]``: encoded
                box, one-hot class, and a final flag that is 1 for anchors
                matched to an object.
            y_pred: Sequence ``(loc, conf)`` with ``loc`` of shape
                ``[batch_size, num_anchors, 4]`` and ``conf`` (raw scores) of
                shape ``[batch_size, num_anchors, num_classes]``.

        Returns:
            Scalar tensor: the summed positive and mined-negative losses
            divided by the number of positive anchors.
        """
        num_boxes = y_true.size()[1]
        # Turn class scores into probabilities and join with the box output.
        y_pred = torch.cat([y_pred[0], nn.Softmax(-1)(y_pred[1])], dim = -1)

        # Classification loss: [batch, anchors, num_classes] -> [batch, anchors]
        conf_loss = self._softmax_loss(y_true[:, :, 4:-1], y_pred[:, :, 4:])
        # Localisation loss: [batch, anchors, 4] -> [batch, anchors]
        loc_loss = self._l1_smooth_loss(y_true[:, :, :4],
                                        y_pred[:, :, :4])

        # Losses of the matched (positive) anchors, per image.
        pos_loc_loss = torch.sum(loc_loss * y_true[:, :, -1], dim=1)
        pos_conf_loss = torch.sum(conf_loss * y_true[:, :, -1], dim=1)

        # Number of positives and negatives per image: [batch_size]
        num_pos = torch.sum(y_true[:, :, -1], dim=-1)
        num_neg = torch.min(self.neg_pos_ratio * num_pos, num_boxes - num_pos)

        # If no image in the batch yields any negatives, fall back to a
        # fixed number of negatives.
        pos_num_neg_mask = num_neg > 0
        has_min = torch.sum(pos_num_neg_mask)
        num_neg_batch = torch.sum(num_neg) if has_min > 0 else self.negatives_for_hard

        # Objectness score: sum of the non-background class probabilities.
        confs_start = 4 + self.background_label_id + 1
        confs_end = confs_start + self.num_classes - 1
        max_confs = torch.sum(y_pred[:, :, confs_start:confs_end], dim=2)

        # Over the whole batch, pick the unmatched anchors with the highest
        # objectness as hard negatives. k is clamped so the fallback count
        # cannot exceed the number of anchors available.
        max_confs = (max_confs * (1 - y_true[:, :, -1])).view([-1])
        k = min(int(num_neg_batch.item()), max_confs.numel())
        _, indices = torch.topk(max_confs, k = k)
        neg_conf_loss = torch.gather(conf_loss.view([-1]), 0, indices)

        # Normalise by the number of positives; images without positives
        # count as one to avoid division by zero.
        num_pos = torch.where(num_pos != 0, num_pos, torch.ones_like(num_pos))
        total_loss = torch.sum(pos_conf_loss) + torch.sum(neg_conf_loss) + torch.sum(self.alpha * pos_loc_loss)
        total_loss = total_loss / torch.sum(num_pos)
        return total_loss
4. SSD模型的数据集准备
在目标检测问题中,通常会提供图像的ground truth框标注信息。把图像送入SSD模型时,会进行尺寸调整,因此对应的标注框也应进行调整。此外,SSD模型的学习目标是预设anchor的边界框偏移量与类别概率,因此需要在数据集载入时进行ground truth框标注信息到anchor框标注信息的转换。
class SSDDataset(Dataset):
    """Dataset that loads images and converts ground-truth boxes into
    per-anchor training targets."""

    def __init__(self, annotation_lines, input_shape, anchors, batch_size, num_classes, overlap_threshold = 0.5):
        """Store the dataset configuration.

        Args:
            annotation_lines: One annotation string per image: an image path
                followed by whitespace-separated ``x1,y1,x2,y2,class`` boxes.
            input_shape: ``(height, width)`` of the network input.
            anchors: Array of shape ``(num_anchors, 4)`` with normalised
                ``(x_min, y_min, x_max, y_max)`` anchors.
            batch_size: Stored on the instance.
            num_classes: Number of class slots in the target, background
                included (the one-hot label uses ``num_classes - 1`` slots).
            overlap_threshold: IoU above which an anchor is matched to a
                ground-truth box.
        """
        super(SSDDataset, self).__init__()
        self.annotation_lines = annotation_lines
        self.length = len(self.annotation_lines)
        self.input_shape = input_shape
        self.anchors = anchors
        self.num_anchors = len(anchors)
        self.batch_size = batch_size
        self.num_classes = num_classes
        self.overlap_threshold = overlap_threshold

    def __len__(self):
        """Return the number of annotation lines."""
        return self.length

    def __getitem__(self, index):
        """Return ``(image, target)`` for one sample.

        The image is a float32 array in channel-first layout; the target is
        the float32 array produced by ``assign_boxes``.
        """
        index = index % self.length
        image, box = self.get_random_data(self.annotation_lines[index], self.input_shape)
        image_data = np.transpose(preprocess_input(np.array(image, dtype = np.float32)), (2, 0, 1))
        if len(box) != 0:
            boxes = np.array(box[:, :4], dtype=np.float32)
            # Normalise the coordinates to [0, 1].
            boxes[:, [0, 2]] = boxes[:, [0, 2]] / self.input_shape[1]
            boxes[:, [1, 3]] = boxes[:, [1, 3]] / self.input_shape[0]
            # One-hot encode the class of every ground-truth box.
            one_hot_label = np.eye(self.num_classes - 1)[np.array(box[:, 4], np.int32)]
            box = np.concatenate([boxes, one_hot_label], axis=-1)
        # Convert ground-truth boxes into per-anchor targets.
        box = self.assign_boxes(box)
        return np.array(image_data, np.float32), np.array(box, np.float32)

    def get_random_data(self, annotation_line, input_shape):
        """Load one image, resize it with gray padding and adjust its boxes.

        Args:
            annotation_line: Annotation string for one image.
            input_shape: ``(height, width)`` of the target image.

        Returns:
            Tuple ``(image_data, box)``: the padded image as a float32 array
            and the boxes rescaled to the padded image, with boxes no larger
            than one pixel in either dimension removed.
        """
        line = annotation_line.split()
        # Read the image and convert it to RGB.
        image = Image.open(line[0])
        image = cvtColor(image)
        # Source image size and target size.
        iw, ih = image.size
        h, w = input_shape
        # Ground-truth boxes, one row per box.
        box = np.array([np.array(list(map(int, box.split(',')))) for box in line[1:]])

        # Scale to fit inside the target while keeping the aspect ratio.
        scale = min(w/iw, h/ih)
        nw = int(iw*scale)
        nh = int(ih*scale)
        dx = (w-nw)//2
        dy = (h-nh)//2

        # Fill the unused area with gray bars.
        image = image.resize((nw, nh), Image.BICUBIC)
        new_image = Image.new('RGB', (w, h), (128, 128, 128))
        new_image.paste(image, (dx, dy))
        image_data = np.array(new_image, np.float32)

        # Apply the same scale and offset to the boxes, then clip them.
        if len(box) > 0:
            box[:, [0, 2]] = box[:, [0, 2]]*nw/iw + dx
            box[:, [1, 3]] = box[:, [1, 3]]*nh/ih + dy
            box[:, 0:2][box[:, 0:2] < 0] = 0
            box[:, 2][box[:, 2] > w] = w
            box[:, 3][box[:, 3] > h] = h
            box_w = box[:, 2] - box[:, 0]
            box_h = box[:, 3] - box[:, 1]
            box = box[np.logical_and(box_w > 1, box_h > 1)]  # discard invalid box
        return image_data, box

    def iou(self, box):
        """Return the IoU of one ground-truth box with every anchor."""
        # Intersection rectangle of the box with each anchor.
        inter_upleft = np.maximum(self.anchors[:, :2], box[:2])
        inter_botright = np.minimum(self.anchors[:, 2:4], box[2:])
        inter_wh = inter_botright - inter_upleft
        inter_wh = np.maximum(inter_wh, 0)
        inter = inter_wh[:, 0] * inter_wh[:, 1]
        # Area of the ground-truth box.
        area_true = (box[2] - box[0]) * (box[3] - box[1])
        # Area of each anchor.
        area_gt = (self.anchors[:, 2] - self.anchors[:, 0])*(self.anchors[:, 3] - self.anchors[:, 1])
        union = area_true + area_gt - inter
        iou = inter / union
        return iou

    def encode_box(self, box, return_iou=True, variances = [0.1, 0.1, 0.2, 0.2]):
        """Encode one ground-truth box against all anchors.

        Args:
            box: ``(x_min, y_min, x_max, y_max)`` of the ground-truth box.
            return_iou: When true, a fifth column holds the IoU of matched
                anchors.
            variances: Scaling factors for the centre and size offsets.

        Returns:
            Flattened array of ``num_anchors * (4 + return_iou)`` values;
            rows of unmatched anchors are all zero.
        """
        iou = self.iou(box)
        encoded_box = np.zeros((self.num_anchors, 4 + return_iou))

        # Anchors overlapping enough are responsible for this box.
        assign_mask = iou > self.overlap_threshold
        # If none passes the threshold, use the best-overlapping anchor.
        if not assign_mask.any():
            assign_mask[iou.argmax()] = True
        if return_iou:
            encoded_box[:, -1][assign_mask] = iou[assign_mask]

        assigned_anchors = self.anchors[assign_mask]

        # Centre and size of the ground-truth box.
        box_center = 0.5 * (box[:2] + box[2:])
        box_wh = box[2:] - box[:2]
        # Centre and size of the matched anchors.
        assigned_anchors_center = (assigned_anchors[:, 0:2] + assigned_anchors[:, 2:4]) * 0.5
        assigned_anchors_wh = (assigned_anchors[:, 2:4] - assigned_anchors[:, 0:2])

        # Centre offsets relative to the anchor size, then log size ratios;
        # both are divided by their variances.
        encoded_box[:, :2][assign_mask] = box_center - assigned_anchors_center
        encoded_box[:, :2][assign_mask] /= assigned_anchors_wh
        encoded_box[:, :2][assign_mask] /= np.array(variances)[:2]
        encoded_box[:, 2:4][assign_mask] = np.log(box_wh / assigned_anchors_wh)
        encoded_box[:, 2:4][assign_mask] /= np.array(variances)[2:4]
        return encoded_box.ravel()

    def assign_boxes(self, boxes):
        """Build the per-anchor training target from ground-truth boxes.

        Args:
            boxes: Array of shape ``(num_boxes, 4 + num_classes - 1)`` with
                normalised corners followed by a one-hot class, or an empty
                array.

        Returns:
            Array of shape ``(num_anchors, 4 + num_classes + 1)``:
            columns ``:4`` are the encoded box, ``4:-1`` the class one-hot
            (column 4 is background, the default), and ``-1`` is 1 when the
            anchor is matched to an object.
        """
        assignment = np.zeros((self.num_anchors, 4 + self.num_classes + 1))
        assignment[:, 4] = 1.0
        if len(boxes) == 0:
            return assignment

        # Encode every ground-truth box against all anchors, then reshape to
        # [num_true_box, num_anchors, 4 + 1] (encoded box plus IoU).
        encoded_boxes = np.apply_along_axis(self.encode_box, 1, boxes[:, :4])
        encoded_boxes = encoded_boxes.reshape(-1, self.num_anchors, 5)

        # For every anchor, the ground-truth box with the highest IoU.
        best_iou = encoded_boxes[:, :, -1].max(axis=0)
        best_iou_idx = encoded_boxes[:, :, -1].argmax(axis=0)
        best_iou_mask = best_iou > 0
        best_iou_idx = best_iou_idx[best_iou_mask]

        # Number of matched anchors.
        assign_num = len(best_iou_idx)

        # Keep only the matched anchors and copy their encoded boxes.
        encoded_boxes = encoded_boxes[:, best_iou_mask, :]
        assignment[:, :4][best_iou_mask] = encoded_boxes[best_iou_idx, np.arange(assign_num), :4]
        # Matched anchors are not background; copy the object class instead.
        assignment[:, 4][best_iou_mask] = 0
        assignment[:, 5:-1][best_iou_mask] = boxes[best_iou_idx, 4:]
        # Flag the anchor as containing an object.
        assignment[:, -1][best_iou_mask] = 1
        return assignment
SSD的完整PyTorch实现可参考 ssd-pytorch。