利用MTCNN和Facenet实现实时人脸识别

MTCNN做人脸检测和对齐

将需要识别的含有人脸的图片放在 `./images/test_img` 目录下，代码首先使用 detection_face_and_crop 对提供的人脸图像进行预处理（人脸检测和对齐），并将预处理之后的人脸图像存放到 `./images/emb_img` 目录下。
三个重要参数：
minisize: 图片中人脸的最小尺寸，控制人脸金字塔阶数参数之一，其值越小，可计算的阶数越多，计算量越大。
threshold: MTCNN中三个网络人脸框的阈值，三个阈值可以分别设置，这里分别设置为0.6、0.7、0.7。阈值太小将会导致人脸框太多，增加计算量；还可能导致不是人脸的图像被检测为人脸。
factor: 生成图像金字塔时候的缩放系数, 范围(0,1)，可控制图像金字塔的阶层数的参数之一，越大，阶层越多，计算量越大。
detect_face()返回值为人脸框的坐标以及是人脸的概率。

动态分配显存

1
2
3

# Build a session whose GPU memory is allocated on demand.
config = tf.ConfigProto()
config.gpu_options.allow_growth = True  # let GPU memory usage grow as needed instead of reserving it all up front
sess = tf.Session(config=config)

detection_face_and_crop.py代码如下：

from cv2 import cv2
from scipy import misc
import tensorflow as tf
import numpy as np
import sys
import os
import copy
import argparse
import facenet
import align.detect_face
def main():
    """Crop the face out of every image in ``./images/test_img`` and save it.

    Every entry of ``./images/test_img`` is passed to
    ``load_and_align_data`` (output size 160, margin 44).  Each returned
    face is written to ``./images/emb_img`` under the file name of the
    image it came from; the output directory is created when missing.
    The number of saved faces is printed at the end.
    """
    img_dir = './images/test_img'
    emb_dir = './images/emb_img'

    img_path_set = []  # path of every input image
    for file in os.listdir(img_dir):
        print('loading: {}'.format(file))
        img_path_set.append(os.path.join(img_dir, file))

    # load_and_align_data() removes, in place, every path in which no face
    # was detected, so afterwards img_path_set[i] is the source of images[i].
    images = load_and_align_data(img_path_set, 160, 44)

    if not os.path.exists(emb_dir):
        os.mkdir(emb_dir)

    # Pair each face with the path it was cropped from.  Re-listing img_dir
    # here would mislabel (or overrun) the faces whenever an image was
    # skipped during detection.
    count = 0
    for img_path, face in zip(img_path_set, images):
        file = os.path.basename(img_path)
        print("save {} ".format(file))
        misc.imsave(os.path.join(emb_dir, file), face)
        count += 1
    print("get {} faces".format(count))
    
def load_and_align_data(image_paths, image_size, margin):
    """Detect, crop, resize and prewhiten one face per image.

    Args:
        image_paths: list of image file paths.  Paths in which no face is
            detected are REMOVED from this list in place, so that on return
            ``image_paths[i]`` is the source of the i-th returned face.
        image_size: side length, in pixels, of the square output crops.
        margin: total number of pixels added around the detected box
            (half on each side), clipped to the image borders.

    Returns:
        A numpy array stacking the prewhitened crops along axis 0.  When no
        face is detected in any image, an empty array of shape
        ``(0, image_size, image_size, 3)`` is returned.

    Only the first bounding box returned by the detector is used for each
    image.
    """
    minisize = 20  # minimum size of face
    threshold = [0.6, 0.7, 0.7]  # bounding-box thresholds of the three nets
    factor = 0.709  # scale factor of the image pyramid

    img_list = []
    print('Creating networks and loading parameters')
    with tf.Graph().as_default():
        # allocate GPU memory on demand
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        # config.gpu_options.per_process_gpu_memory_fraction = 0.4 # use 40% capacity of GPU

        # The session is used as a context manager so it is closed once all
        # images have been processed (the detectors run through it).
        with tf.Session(config=config) as sess:
            pnet, rnet, onet = align.detect_face.create_mtcnn(sess, None)

            # Iterate over a copy: image_paths is pruned inside the loop.
            for image in list(image_paths):
                print(image)
                img = misc.imread(os.path.expanduser(image), mode='RGB')
                img_size = np.asarray(img.shape)[0:2]
                bounding_boxes, _ = align.detect_face.detect_face(
                    img, minisize, pnet, rnet, onet, threshold, factor)
                print(bounding_boxes)
                if len(bounding_boxes) < 1:
                    image_paths.remove(image)
                    print("can't detect face, remove ", image)
                    continue

                det = np.squeeze(bounding_boxes[0, 0:4])

                # Expand the box by margin/2 on every side, clipped to the
                # image; img_size is (height, width).
                bb = np.zeros(4, dtype=np.int32)
                bb[0] = np.maximum(det[0] - margin / 2, 0)
                bb[1] = np.maximum(det[1] - margin / 2, 0)
                bb[2] = np.minimum(det[2] + margin / 2, img_size[1])
                bb[3] = np.minimum(det[3] + margin / 2, img_size[0])
                cropped = img[bb[1]:bb[3], bb[0]:bb[2], :]

                # resize the crop to the requested square size
                aligned = misc.imresize(
                    cropped, (image_size, image_size), interp='bilinear')
                img_list.append(facenet.prewhiten(aligned))

    if not img_list:
        # np.stack() raises on an empty list; return an empty batch instead.
        return np.zeros((0, image_size, image_size, 3))
    return np.stack(img_list)


# Run the preprocessing step only when this file is executed as a script.
if __name__ == "__main__":
    main()

利用facenet和mtcnn做人脸识别

利用opencv调取电脑或者网络摄像头，也可以读取视频进行识别。
读取网络摄像头,当使用本地摄像头时,VideoCapture()参数设置为“0”：

video="http://admin:admin@192.168.137.33:8081/"
capture =cv2.VideoCapture(video)

读取视频同时设置读入视频的宽高和帧率

# Open a video file and request a frame width, height and frame rate.
dirVideo = "video1.mp4"
capture =cv2.VideoCapture(dirVideo)
capture.set(cv2.CAP_PROP_FRAME_WIDTH, 1920)
capture.set(cv2.CAP_PROP_FRAME_HEIGHT, 1080)
capture.set(cv2.CAP_PROP_FPS, 60)

当需要将opencv的视频输出时，使用opencv的VideoWriter()输出视频：

1
2
3

# Output frame size (width, height), taken from the capture's properties.
size = (int(capture.get(cv2.CAP_PROP_FRAME_WIDTH)), int(capture.get(cv2.CAP_PROP_FRAME_HEIGHT)))
fourcc = cv2.VideoWriter_fourcc(*'XVID')
# Writer for 'output.avi' at 20 frames per second with the size above.
writeVideo = cv2.VideoWriter('output.avi',fourcc, 20, size, 1)

这里以一个短视频人脸识别为例，首先将原视频输入，利用opencv获取视频帧，这里是将原视频中每三帧取一帧来做人脸距离计算。主要过程是将opencv读取的画面做人脸检测和对齐（使用MTCNN网络），再将得到的人脸图像和从emb_img中读取的包含人脸的图像做距离计算，将距离小于阈值的图像对应的标签放到列表。最后就是在视频中绘制人脸框并将对应的标签显示在框上，最终效果如下图：
neural
完整源代码如下：

# Recognise faces in a video: every `frame_interval`-th frame is run through
# face detection, embedded, and compared against the reference embeddings.
#
# NOTE(review): this script relies on names that are not defined in the
# visible part of the file -- `sess`, `embeddings`, `images_placeholder`,
# `phase_train_placeholder`, `compare_emb`, `compare_num`, `all_obj`, and a
# frame-based `load_and_align_data(frame, size, margin)` returning
# (tag, bounding_box, crop_image).  Confirm they are set up before this runs.

# To read from a network camera instead of a file:
# video="http://admin:admin@192.168.137.33:8081/"
# capture =cv2.VideoCapture(video)
dirVideo = "video1.mp4"
capture = cv2.VideoCapture(dirVideo)
# capture.set(cv2.CAP_PROP_FRAME_WIDTH, 1920)
# capture.set(cv2.CAP_PROP_FRAME_HEIGHT, 1080)
capture.set(cv2.CAP_PROP_FPS, 60)

# To record the annotated frames, create a writer and enable the
# writeVideo.write(frame) call in the loop below:
# size = (int(capture.get(cv2.CAP_PROP_FRAME_WIDTH)), int(capture.get(cv2.CAP_PROP_FRAME_HEIGHT)))
# fourcc = cv2.VideoWriter_fourcc(*'XVID')
# writeVideo = cv2.VideoWriter('output.avi',fourcc, 20, size, 1)
cv2.namedWindow("camera", 1)
picNumber = 0
count = 0
frame_interval = 3
try:
    while True:
        isSuccess, frame = capture.read()
        if not isSuccess:
            # No frame was returned (end of the video or a read failure);
            # stop instead of passing an invalid frame to cvtColor.
            break
        if count % frame_interval == 0:
            rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            tag, bounding_box, crop_image = load_and_align_data(rgb_frame, 160, 44)
            if tag:
                feed_dict = {images_placeholder: crop_image, phase_train_placeholder: False}
                emb = sess.run(embeddings, feed_dict=feed_dict)
                print(emb)
                temp_num = len(emb)
                fin_obj = []
                # Euclidean distance between each face in the frame and every
                # reference embedding; the closest reference gives the label.
                for i in range(temp_num):
                    dist_list = [
                        np.sqrt(np.sum(np.square(np.subtract(emb[i, :], compare_emb[j, :]))))
                        for j in range(compare_num)
                    ]
                    min_value = min(dist_list)
                    if min_value > 0.65:
                        # nothing is close enough: treat the face as unknown
                        fin_obj.append('UNKNOW')
                    else:
                        # label is the first 6 characters of the best match
                        fin_obj.append(all_obj[dist_list.index(min_value)][0:6])
                # draw the face box and its label on the original (BGR) frame
                for rec_position in range(temp_num):
                    cv2.rectangle(frame,
                                  (bounding_box[rec_position, 0],
                                   bounding_box[rec_position, 1]),
                                  (bounding_box[rec_position, 2],
                                   bounding_box[rec_position, 3]),
                                  (0, 255, 0), 2, 8, 0)
                    cv2.putText(frame,
                                fin_obj[rec_position],
                                (bounding_box[rec_position, 0], bounding_box[rec_position, 1]),
                                cv2.FONT_HERSHEY_COMPLEX_SMALL,
                                0.8,
                                (0, 0, 255),
                                thickness=2,
                                lineType=2)
            # writeVideo.write(frame)
            cv2.imshow('camera', frame)
        count += 1
        key = cv2.waitKey(3)
        if key == 27:
            print("ESC break")
            break
        if key == ord(' '):
            # space bar: save the current frame as a numbered snapshot
            picNumber += 1
            filename = "%s_%s.jpg" % (dirVideo, picNumber)
            cv2.imwrite(filename, frame)
finally:
    # release the capture and the window even if processing fails
    capture.release()
    cv2.destroyWindow("camera")