利用MTCNN和Facenet实现实时人脸识别

MTCNN做人脸检测和对齐

  将需要识别的含有人脸的图片放在 ./images/test_img 目录下,detection_face_and_crop.py 首先对这些人脸图像进行预处理(人脸检测和对齐),并将预处理之后的人脸图像存放到 ./images/emb_img 目录下。
  三个重要参数:
  minisize: 图片中人脸的最小尺寸,是控制图像金字塔阶数的参数之一,其值越小,可计算的阶数越多,计算量越大。
  threshold: MTCNN中三个网络(P-Net、R-Net、O-Net)人脸框的阈值,三个阈值可以分别设置,这里分别设置为0.6、0.7、0.7。阈值太小将会导致人脸框太多,增加计算量,还可能把不是人脸的区域检测为人脸。
  factor: 生成图像金字塔时的缩放系数,范围(0,1),是控制图像金字塔阶层数的参数之一,其值越大,阶层越多,计算量越大。
  detect_face()的返回值为人脸框的坐标以及该框是人脸的概率。

动态分配显存

# Session configuration that lets TensorFlow allocate GPU memory on demand.
config = tf.ConfigProto()
config.gpu_options.allow_growth = True # start small and grow the GPU allocation as needed instead of reserving it all up front
sess = tf.Session(config=config)

detection_face_and_crop.py代码如下:

from cv2 import cv2
from scipy import misc
import tensorflow as tf
import numpy as np
import sys
import os
import copy
import argparse
import facenet
import align.detect_face
def main():
    """Crop the detected face out of every image in ./images/test_img.

    Each image is passed through load_and_align_data() and the resulting
    face is written to ./images/emb_img under the same file name as its
    source image. Images in which no face is detected are skipped.
    """
    img_dir = './images/test_img'
    emb_dir = './images/emb_img'

    img_path_set = []  # path of every input image
    for file in os.listdir(img_dir):
        print('loading: {}'.format(file))
        img_path_set.append(os.path.join(img_dir, file))

    # load_and_align_data() removes from img_path_set every path in which no
    # face was detected, so afterwards img_path_set[i] belongs to images[i].
    images = load_and_align_data(img_path_set, 160, 44)

    os.makedirs(emb_dir, exist_ok=True)

    # Pair each face with the path it came from. Re-listing img_dir and
    # indexing by position would mislabel (and eventually overrun) the faces
    # as soon as one input image had been dropped.
    count = 0
    for img_path, face in zip(img_path_set, images):
        file = os.path.basename(img_path)
        print("save {} ".format(file))
        misc.imsave(os.path.join(emb_dir, file), face)
        count += 1
    print("get {} faces".format(count))

def load_and_align_data(image_paths, image_size, margin):
    """Detect, crop, resize and prewhiten the first face of each image.

    Args:
        image_paths: list of image file paths. It is modified in place:
            every path in which no face is detected is removed, so that on
            return it lines up index-for-index with the returned array.
        image_size: side length, in pixels, of the square output faces.
        margin: total number of pixels added around the detected box
            (half on each side), clipped to the image borders.

    Returns:
        The processed faces stacked along a new first axis. When no face is
        detected in any image, an empty array with first dimension 0 is
        returned instead of raising from np.stack.
    """
    minisize = 20  # minimum size of face
    threshold = [0.6, 0.7, 0.7]  # bounding-box thresholds of the three nets
    factor = 0.709  # scale factor of the image pyramid

    img_list = []

    print('Creating networks and loading parameters')
    with tf.Graph().as_default():
        # allocate GPU memory on demand instead of reserving it all up front
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        # config.gpu_options.per_process_gpu_memory_fraction = 0.4  # cap at 40% of GPU memory

        # The session has to stay open while pnet/rnet/onet are used, and is
        # closed by the context manager once every image has been processed.
        with tf.Session(config=config) as sess:
            pnet, rnet, onet = align.detect_face.create_mtcnn(sess, None)

            # iterate over a copy because image_paths is pruned inside the loop
            for image in list(image_paths):
                print(image)
                img = misc.imread(os.path.expanduser(image), mode='RGB')
                img_size = np.asarray(img.shape)[0:2]  # (height, width)
                bounding_boxes, _ = align.detect_face.detect_face(
                    img, minisize, pnet, rnet, onet, threshold, factor)
                print(bounding_boxes)
                if len(bounding_boxes) < 1:
                    image_paths.remove(image)
                    print("can't detect face, remove ", image)
                    continue

                # only the first detected box of each image is used
                det = np.squeeze(bounding_boxes[0, 0:4])

                # expand the box by margin/2 per side, clipped to the image
                bb = np.zeros(4, dtype=np.int32)
                bb[0] = np.maximum(det[0] - margin / 2, 0)
                bb[1] = np.maximum(det[1] - margin / 2, 0)
                bb[2] = np.minimum(det[2] + margin / 2, img_size[1])
                bb[3] = np.minimum(det[3] + margin / 2, img_size[0])
                cropped = img[bb[1]:bb[3], bb[0]:bb[2], :]

                # resize the crop to the requested size, then normalise it
                aligned = misc.imresize(
                    cropped, (image_size, image_size), interp='bilinear')
                prewhitened = facenet.prewhiten(aligned)
                img_list.append(prewhitened)

    if not img_list:
        # np.stack([]) raises ValueError; hand back an empty batch instead.
        # NOTE(review): assumes facenet.prewhiten keeps the
        # (image_size, image_size, 3) shape of its input - confirm.
        return np.zeros((0, image_size, image_size, 3))

    images = np.stack(img_list)
    return images


if __name__ == "__main__":
    # Run the face pre-processing only when executed as a script.
    main()

利用facenet和mtcnn做人脸识别

  利用opencv调用电脑摄像头或者网络摄像头,也可以读取视频文件进行识别。
读取网络摄像头的示例如下;使用本地摄像头时,将VideoCapture()的参数设置为整数0(设备索引):

# Open a network camera stream by URL. The URL embeds the camera's user name
# and password, so replace it with your own device's address and credentials.
video="http://admin:admin@192.168.137.33:8081/"
capture =cv2.VideoCapture(video)

  读取视频文件,同时设置读入视频的宽高和帧率:

# Open a local video file and request a frame size and frame rate.
# NOTE(review): set() only takes effect when the capture backend supports the
# property; for file sources these requests may be ignored - confirm.
dirVideo = "video1.mp4"
capture =cv2.VideoCapture(dirVideo)
capture.set(cv2.CAP_PROP_FRAME_WIDTH, 1920)
capture.set(cv2.CAP_PROP_FRAME_HEIGHT, 1080)
capture.set(cv2.CAP_PROP_FPS, 60)

  当需要将opencv处理后的视频输出时,使用opencv的VideoWriter()写出视频:

# Create a writer whose frame size matches what the capture reports.
size = (int(capture.get(cv2.CAP_PROP_FRAME_WIDTH)), int(capture.get(cv2.CAP_PROP_FRAME_HEIGHT)))  # (width, height)
fourcc = cv2.VideoWriter_fourcc(*'XVID')  # four-character codec code
# arguments: output file, codec, frames per second, frame size, colour flag
writeVideo = cv2.VideoWriter('output.avi',fourcc, 20, size, 1)

  这里以一个短视频人脸识别为例,首先将原视频输入,利用opencv获取视频帧,这里是将原视频中每三帧取一帧来做人脸距离计算。主要过程是将opencv读取的画面做人脸检测和对齐(使用MTCNN网络),再将得到的人脸图像和从emb_img中读取的包含人脸的图像做距离计算,将距离小于阈值的图像对应的标签放到列表。最后就是在视频中绘制人脸框并将对应的标签显示在框上,最终效果如下图:
neural
  完整源代码如下:

# Recognise faces in a video: every `frame_interval`-th frame is run through
# face detection and embedding, each face is matched against the reference
# embeddings, and the frame is shown with labelled boxes.
#
# Relies on names set up elsewhere in the script: sess, embeddings,
# images_placeholder, phase_train_placeholder, compare_emb, compare_num,
# all_obj, and a frame-based load_and_align_data() returning
# (tag, bounding_box, crop_image).

# Alternative source: a network camera stream.
# video="http://admin:admin@192.168.137.33:8081/"
# capture =cv2.VideoCapture(video)
dirVideo = "video1.mp4"
capture = cv2.VideoCapture(dirVideo)
# capture.set(cv2.CAP_PROP_FRAME_WIDTH, 1920)
# capture.set(cv2.CAP_PROP_FRAME_HEIGHT, 1080)
capture.set(cv2.CAP_PROP_FPS, 60)

# Optional: also write the annotated frames to a video file.
# size = (int(capture.get(cv2.CAP_PROP_FRAME_WIDTH)), int(capture.get(cv2.CAP_PROP_FRAME_HEIGHT)))
# fourcc = cv2.VideoWriter_fourcc(*'XVID')
# writeVideo = cv2.VideoWriter('output.avi',fourcc, 20, size, 1)
cv2.namedWindow("camera", 1)
picNumber = 0  # number of snapshots saved with the space bar
count = 0  # number of frames read so far
frame_interval = 3  # run recognition on every third frame
try:
    while True:
        isSuccess, frame = capture.read()
        if not isSuccess:
            # End of the video or a failed read: there is no usable frame,
            # and passing it on to cvtColor would crash.
            break

        if count % frame_interval == 0:
            rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            tag, bounding_box, crop_image = load_and_align_data(rgb_frame, 160, 44)
            if tag:
                feed_dict = {images_placeholder: crop_image, phase_train_placeholder: False}
                emb = sess.run(embeddings, feed_dict=feed_dict)
                print(emb)
                temp_num = len(emb)
                fin_obj = []
                # Euclidean distance from each detected face to every
                # reference embedding; the closest reference names the face.
                for i in range(temp_num):
                    dists = np.sqrt(np.sum(
                        np.square(compare_emb[:compare_num] - emb[i, :]), axis=1))
                    nearest = int(np.argmin(dists))
                    if dists[nearest] > 0.65:
                        fin_obj.append('UNKNOW')
                    else:
                        # label is the first six characters of the reference name
                        fin_obj.append(all_obj[nearest][0:6])
                # draw a box and its label for every recognised face
                for rec_position in range(temp_num):
                    left, top, right, bottom = bounding_box[rec_position, 0:4]
                    cv2.rectangle(frame, (left, top), (right, bottom),
                                  (0, 255, 0), 2, 8, 0)
                    cv2.putText(frame,
                                fin_obj[rec_position],
                                (left, top),
                                cv2.FONT_HERSHEY_COMPLEX_SMALL,
                                0.8,
                                (0, 0, 255),
                                thickness=2,
                                lineType=2)

        # NOTE(review): the listing lost its indentation; showing every frame
        # (not only the processed ones) is an assumption - confirm.
        # writeVideo.write(frame)
        cv2.imshow('camera', frame)
        count += 1

        key = cv2.waitKey(3)
        if key == 27:
            print("ESC break")
            break
        if key == ord(' '):
            # space bar: save the currently displayed frame
            picNumber += 1
            filename = "%s_%s.jpg" % (dirVideo, picNumber)
            cv2.imwrite(filename, frame)
finally:
    # release the capture and the window even if processing fails
    capture.release()
    cv2.destroyWindow("camera")