将dlib xml数据转换成YOLOv3 数据

dlib 的训练数据是一个测试文件夹和一个训练文件夹，分别放着若干图片和一个xml文件，xml文件保存了对应图片的标注信息。

dlibData:
+---test
|       1.jpg
|       10.jpg
|       11.jpg
|       ...
|       56.jpg
|       57.jpg
|       test.xml
|       
\---train
        1.jpg
        ...
        95.jpg
        96.jpg
        97.jpg
        98.jpg
        99.jpg
        train.xml

Yolo的训练数据结构如下：

YOLOData:.
|   classes.names # 类别名称
|   test.txt # 验证的图片路径
|   train.txt # 训练的图片路径
|   SplitData.py # 脚本文件， 对labels的标注文件**对应的图片**进行划分得到train.txt 和 test.txt （由于xml文件有些图片没有标注，但是labels的标注文件中肯定有标注）
|   xml2txt.py # 将train（test）下的图片转到JPEGImages中并随机命名， train.xml(test) 转成labels中的标注文件，图片对应标注文件
|   YOLOData.data # 保存训练集和验证集的路径等配置信息
|   
+---JPEGImages
|       0eQ2ARay.jpg
|       0HzMbDSE.jpg
|       0K7SYueV.jpg
|       ...
|       0TIf1aij.jpg
|       10QmnWfi.jpg
|       1bVJ5Zkl.jpg

|       
+---labels
|       0eQ2ARay.txt
|       0HzMbDSE.txt
|       0K7SYueV.txt
|       ...
|       Zi2Ec8Tt.txt
|       znvQ045k.txt
|       zOvPyFtR.txt
|       ZViHLeBs.txt
|       
+---TestYOLOData # 测试训练结果
|       darknet-yolov3.cfg # yolo配置文件
|       img2video.py # 将图片转化成视频的工具
|       object_detection_yolo.py # 从视频中进行目标检测
|       test.avi # 由图片生成的视频
|       test_yolo_out_py.avi # 视频输出结果
|       
+---weights
|       darknet-yolov3_final.weights # 训练得到的权重文件
|       
+---test # dlib 数据格式的验证集
|       1.jpg
|       10.jpg
|       11.jpg
|       ...
|       56.jpg
|       57.jpg
|       test.xml
|       
\---train # dlib 数据格式的训练集
        1.jpg
        ...
        95.jpg
        96.jpg
        97.jpg
        98.jpg
        99.jpg
        train.xml

xml2txt.py

'''
dlib .xml file to yolo .txt file 

python xml2txt.py dlib_train_path dlib_test_path 

example:
	python xml2txt.py /home/hichens/YOLOData/train/ /home/hichens/YOLOData/test/ 
'''

import cv2
import os 
import subprocess
import sys 
import random 
import string

# Command-line arguments: the dlib train and test directories. Both are
# expected to end with "/" -- the path arithmetic below relies on it.
train_path = sys.argv[1]
test_path = sys.argv[2]
# Dataset root: the parent directory of the train directory.
file_path = "/".join(train_path.split("/")[:-2])
# Recreate the output directories from scratch. They are created under the
# dataset root -- the location that is cleaned here and that xml2txt() writes
# to -- rather than in whatever the current working directory happens to be.
for out_dir in ("JPEGImages", "labels"):
    subprocess.run(['rm', '-rf', file_path + "/" + out_dir + "/"])
    subprocess.run(['mkdir', '-p', file_path + "/" + out_dir])

def xml2txt(xml_path):
    """Convert a dlib annotation XML file into YOLO images and label files.

    The XML is scanned line by line. Each ``<image file='...'>`` line selects
    the current image and each ``<box top='..' left='..' width='..'
    height='..'>`` line adds one bounding box to it. The first box of an image
    copies the image into ``<root>/JPEGImages/`` under a random 8-character
    name and starts ``<root>/labels/<same name>.txt``; every box of that image
    is written to this file as one ``0 x_center y_center w h`` line (class id
    0, values divided by the image width/height). ``<root>`` is the
    grandparent directory of ``xml_path``.

    Images without any box produce no output. Images that cannot be read are
    reported and skipped.

    Args:
        xml_path: "/"-separated path of the XML file; the images it lists are
            looked up in the same directory.
    """
    parts = xml_path.split("/")
    base_path = "/".join(parts[:-2])   # dataset root
    image_dir = "/".join(parts[:-1])   # directory holding the xml and images
    alphabet = string.ascii_letters + string.digits

    img_name = None     # file name taken from the most recent <image> line
    img_size = None     # (H, W) of the current image once it has been read
    label_file = None   # label path of the current image once it was copied

    with open(xml_path, 'r') as f:
        for line in f:
            ss = line.split()
            if not ss:
                continue

            if ss[0] == "<image":
                img_name = line.split("'")[1]
                img_size = None
                label_file = None
                print(img_name)
                continue

            if ss[0] != "<box" or img_name is None:
                continue

            # The quoted attribute values sit at the odd positions after
            # splitting on "'": top, left, width, height.
            top, left, width, height = (int(v) for v in line.split("'")[1:8:2])

            if label_file is None:
                # First box of this image: read it once, copy it once.
                img_path = image_dir + '/' + img_name
                img = cv2.imread(img_path)
                if img is None:
                    print("skip unreadable image:", img_path)
                    img_name = None  # also ignore the remaining boxes
                    continue
                img_size = img.shape[:2]
                # Pick a random name that is not taken yet so that an
                # earlier image/label pair is never overwritten.
                while True:
                    add_label = ''.join(random.sample(alphabet, 8))
                    new_name = base_path + "/JPEGImages/" + add_label + '.jpg'
                    if not os.path.exists(new_name):
                        break
                subprocess.run(['cp', img_path, new_name])
                label_file = base_path + "/labels/" + add_label + ".txt"

            H, W = img_size
            x_center, y_center = (left + width / 2) / W, (top + height / 2) / H
            w, h = width / W, height / H
            print(x_center, y_center, w, h)
            # Append: all boxes of one image belong in the same label file,
            # one line per box.
            with open(label_file, 'a') as file:
                sentence = " ".join(str(i) for i in [0, x_center, y_center, w, h])
                file.write(sentence + "\n")
            
if __name__ == "__main__":
    # Convert the training annotations first, then the test annotations.
    for directory, xml_name in ((train_path, "train.xml"), (test_path, "test.xml")):
        xml2txt(directory + xml_name)

SplitData.py

'''
Split the data into training and validation sets based on the files in labels

python SplitData.py /home/hichens/YOLOData/

'''


import random
import os
import subprocess
import sys

def split_data_set(base_path):
    """Split the labelled images into a training list and a validation list.

    Every ``*.txt`` file in ``<base_path>labels`` stands for the image
    ``<base_path>JPEGImages/<stem>.jpg``. A random 10% of those images
    (rounded down) are written to ``eye_test.txt`` and the rest to
    ``eye_train.txt``, one image path per line. Both files are created in the
    current working directory.

    Args:
        base_path: Dataset root, ending with "/" (it is concatenated directly
            with ``labels`` and ``JPEGImages``).
    """
    label_dir = base_path + 'labels'
    image_dir = base_path + 'JPEGImages'

    # Only label files take part in the split; anything else in the directory
    # is ignored and does not inflate the data size.
    stems = [
        os.path.splitext(name)[0]
        for name in os.listdir(label_dir)
        if os.path.splitext(name)[1] == ".txt"
    ]

    data_test_size = int(0.1 * len(stems))
    # Sample positions in the same 0-based numbering that enumerate() uses
    # below, so exactly data_test_size files end up in the validation list.
    test_indices = set(random.sample(range(len(stems)), k=data_test_size))

    with open("eye_test.txt", 'w') as f_val, open("eye_train.txt", 'w') as f_train:
        for ind, stem in enumerate(stems):
            target = f_val if ind in test_indices else f_train
            target.write(image_dir + '/' + stem + '.jpg' + '\n')

if __name__ == "__main__":
    # The dataset root directory is the only command-line argument.
    dataset_root = sys.argv[1]
    split_data_set(dataset_root)

img2video.py

'''
combine the images to video 

python img2video.py image_path

example:
	python img2video.py /home/hichens/YOLOData/test/ 
'''
# encoding: UTF-8
import glob as gb
import cv2
import sys
in_path = sys.argv[1]  # image directory, expected to end with "/"

# Sorted so the frame order is reproducible; glob itself returns the files in
# arbitrary order.
img_path = sorted(gb.glob(in_path + "*"))
fps = 4  # the bigger the value is, the faster the video plays
size = (640, 480)  # output frame size passed to the writer and to resize
videoWriter = cv2.VideoWriter('test.avi',
                              cv2.VideoWriter_fourcc('I', '4', '2', '0'), fps, size)

# Print roughly 30 progress marks. The step must never be 0, otherwise the
# modulo below raises ZeroDivisionError when there are fewer than 30 files.
step = max(1, len(img_path) // 30)
print("[", end="")
for i, path in enumerate(img_path):
    if i % step == 0:
        print(">", end="")
    img = cv2.imread(path)
    if img is None:
        # imread returns None for files it cannot decode (for example the
        # annotation xml that lives next to the images) - skip those.
        continue
    # Every readable image becomes exactly one frame of the video.
    videoWriter.write(cv2.resize(img, size))
videoWriter.release()  # finalize test.avi
print("]")
print("OK!")

object_detection_yolo.py

'''
test the training result 

example:  
    python object_detection_yolo.py --video=test.avi
    python object_detection_yolo.py --image=bird.jpg

'''

import cv2 as cv
import argparse
import sys
import numpy as np
import os.path

# Initialize the parameters
confThreshold = 0.5  # Confidence threshold
nmsThreshold = 0.4   # Non-maximum suppression threshold

inpWidth = 416   # Width of network's input image (alternative: 608)
inpHeight = 416  # Height of network's input image (alternative: 608)

parser = argparse.ArgumentParser(description='Object Detection using YOLO in OPENCV')
parser.add_argument('--image', help='Path to image file.')
parser.add_argument('--video', help='Path to video file.')
args = parser.parse_args()

# Load names of classes: the file holds one class name per line.
classesFile = "classes.names"
with open(classesFile, 'rt') as f:
    classes = f.read().rstrip('\n').split('\n')

# Give the configuration and weight files for the model and load the network using them.
modelConfiguration = "darknet-yolov3.cfg"
modelWeights = "../weights/darknet-yolov3_800.weights"

net = cv.dnn.readNetFromDarknet(modelConfiguration, modelWeights)
net.setPreferableBackend(cv.dnn.DNN_BACKEND_OPENCV)
net.setPreferableTarget(cv.dnn.DNN_TARGET_CPU)

# Get the names of the output layers
def getOutputsNames(net):
    """Return the names of the network's output layers.

    The output layers are the ones with unconnected outputs.
    ``net.getUnconnectedOutLayers()`` reports them as 1-based indices into
    ``net.getLayerNames()``.

    Args:
        net: Network object providing ``getLayerNames()`` and
            ``getUnconnectedOutLayers()``.

    Returns:
        List of layer names, in the order the indices are reported.
    """
    # Get the names of all the layers in the network
    layersNames = net.getLayerNames()
    # Depending on the OpenCV version the indices arrive either as an N x 1
    # array or as a flat array; flattening handles both shapes.
    out_ids = np.asarray(net.getUnconnectedOutLayers()).flatten()
    return [layersNames[int(i) - 1] for i in out_ids]

# Draw the predicted bounding box
def drawPred(classId, conf, left, top, right, bottom):
    """Draw one detection - box and caption - onto the module-level ``frame``.

    Args:
        classId: Index into the module-level ``classes`` list.
        conf: Confidence value, shown with two decimals.
        left, top, right, bottom: Box corners in pixel coordinates.
    """
    font = cv.FONT_HERSHEY_SIMPLEX

    # Outline of the detected object.
    cv.rectangle(frame, (left, top), (right, bottom), (0, 255, 0), 3)

    # Caption: "<class>:<confidence>" when class names are available,
    # otherwise just the confidence.
    caption = '%.2f' % conf
    if classes:
        assert(classId < len(classes))
        caption = '%s:%s' % (classes[classId], caption)

    # Filled background behind the caption at the top of the box; the caption
    # is kept inside the image when the box touches the upper border.
    (text_w, text_h), baseLine = cv.getTextSize(caption, font, 0.5, 1)
    anchor_y = max(top, text_h)
    bg_top_left = (left, anchor_y - round(1.5 * text_h))
    bg_bottom_right = (left + round(1.5 * text_w), anchor_y + baseLine)
    cv.rectangle(frame, bg_top_left, bg_bottom_right, (0, 0, 255), cv.FILLED)
    cv.putText(frame, caption, (left, anchor_y), font, 0.75, (0, 0, 0), 2)

# Remove the bounding boxes with low confidence using non-maxima suppression
def postprocess(frame, outs):
    """Filter the raw network outputs and draw the surviving detections.

    Each detection row is read as ``[center_x, center_y, width, height,
    objectness, class scores...]`` with the box values relative to the frame
    size. Detections whose best class score exceeds ``confThreshold`` are
    kept, overlapping boxes are removed with non-maximum suppression, and the
    remaining boxes are drawn through ``drawPred`` (which draws on the
    module-level ``frame``).

    Args:
        frame: Image array; only its height and width are read here.
        outs: Iterable of 2-D arrays, one per output layer.
    """
    frameHeight = frame.shape[0]
    frameWidth = frame.shape[1]

    # Scan through all the bounding boxes output from the network and keep only the
    # ones with high confidence scores. Assign the box's class label as the class with the highest score.
    classIds = []
    confidences = []
    boxes = []
    for out in outs:
        print("out.shape : ", out.shape)
        for detection in out:
            scores = detection[5:]
            classId = np.argmax(scores)
            confidence = scores[classId]
            # Diagnostic output for rows with a high objectness value.
            if detection[4] > confThreshold:
                print(detection[4], " - ", scores[classId], " - th : ", confThreshold)
                print(detection)
            if confidence > confThreshold:
                center_x = int(detection[0] * frameWidth)
                center_y = int(detection[1] * frameHeight)
                width = int(detection[2] * frameWidth)
                height = int(detection[3] * frameHeight)
                left = int(center_x - width / 2)
                top = int(center_y - height / 2)
                classIds.append(classId)
                confidences.append(float(confidence))
                boxes.append([left, top, width, height])

    # Perform non maximum suppression to eliminate redundant overlapping boxes with
    # lower confidences.
    indices = cv.dnn.NMSBoxes(boxes, confidences, confThreshold, nmsThreshold)
    # Depending on the OpenCV version the result is an N x 1 array, a flat
    # array, or an empty tuple; flattening handles all of them.
    for i in np.asarray(indices).flatten():
        i = int(i)
        left, top, width, height = boxes[i]
        drawPred(classIds[i], confidences[i], left, top, left + width, top + height)

# Process inputs
winName = 'Deep learning object detection in OpenCV'
cv.namedWindow(winName, cv.WINDOW_NORMAL)

outputFile = "yolo_out_py.avi"
if (args.image):
    # Open the image file
    if not os.path.isfile(args.image):
        print("Input image file ", args.image, " doesn't exist")
        sys.exit(1)
    cap = cv.VideoCapture(args.image)
    outputFile = args.image[:-4]+'_yolo_out_py.jpg'
elif (args.video):
    # Open the video file
    if not os.path.isfile(args.video):
        print("Input video file ", args.video, " doesn't exist")
        sys.exit(1)
    cap = cv.VideoCapture(args.video)
    outputFile = args.video[:-4]+'_yolo_out_py.avi'
else:
    # Webcam input
    cap = cv.VideoCapture(0)

# Get the video writer initialized to save the output video
if (not args.image):
    vid_writer = cv.VideoWriter(outputFile, 
                                cv.VideoWriter_fourcc('M','J','P','G'), 
                                4, 
                                (round(cap.get(cv.CAP_PROP_FRAME_WIDTH)),round(cap.get(cv.CAP_PROP_FRAME_HEIGHT))))

while cv.waitKey(1) < 0:
    
    # get frame from the video
    hasFrame, frame = cap.read()
    
    # Stop the program if reached end of video
    if not hasFrame:
        print("Done processing !!!")
        print("Output file is stored as ", outputFile)
        cv.waitKey(3000)
        break

    # Create a 4D blob from a frame.
    blob = cv.dnn.blobFromImage(frame, 1/255, (inpWidth, inpHeight), [0,0,0], 1, crop=False)

    # Sets the input to the network
    net.setInput(blob)

    # Runs the forward pass to get output of the output layers
    outs = net.forward(getOutputsNames(net))

    # Remove the bounding boxes with low confidence
    postprocess(frame, outs)

    # Put efficiency information. The function getPerfProfile returns the overall time for inference(t) and the timings for each of the layers(in layersTimes)
    t, _ = net.getPerfProfile()
    label = 'Inference time: %.2f ms' % (t * 1000.0 / cv.getTickFrequency())
    #cv.putText(frame, label, (0, 15), cv.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 255))

    # Write the frame with the detection boxes
    if (args.image):
        cv.imwrite(outputFile, frame.astype(np.uint8));
    else:
        vid_writer.write(frame.astype(np.uint8))

    cv.imshow(winName, frame)