将dlib xml数据转换成YOLOv3 数据
dlib 的训练数据是一个测试文件夹和一个训练文件夹,分别放着若干图片和一个xml文件,xml文件保存了对应图片的标注信息。
dlibData:
+---test
| 1.jpg
| 10.jpg
| 11.jpg
| ...
| 56.jpg
| 57.jpg
| test.xml
|
\---train
1.jpg
...
95.jpg
96.jpg
97.jpg
98.jpg
99.jpg
train.xml
Yolo的训练数据结构如下:
YOLOData:.
| classes.names # 类别名称
| test.txt # 验证的图片路径
| train.txt # 训练的图片路径
| SplitData.py # 脚本文件, 对labels的标注文件**对应的图片**进行划分得到train.txt 和 test.txt (由于xml文件有些图片没有标注,但是labels的标注文件中肯定有标注)
| xml2txt.py # 将train(test)下的图片转到JPEGImages中并随机命名, train.xml(test) 转成labels中的标注文件,图片对应标注文件
| YOLOData.data # 保存训练集和验证集的路径等配置信息
|
+---JPEGImages
| 0eQ2ARay.jpg
| 0HzMbDSE.jpg
| 0K7SYueV.jpg
| ...
| 0TIf1aij.jpg
| 10QmnWfi.jpg
| 1bVJ5Zkl.jpg
|
+---labels
| 0eQ2ARay.txt
| 0HzMbDSE.txt
| 0K7SYueV.txt
| ...
| Zi2Ec8Tt.txt
| znvQ045k.txt
| zOvPyFtR.txt
| ZViHLeBs.txt
|
+---TestYOLOData # 测试训练结果
| darknet-yolov3.cfg # yolo配置文件
| img2video.py # 将图片转化成视频的工具
| object_detection_yolo.py # 从视频中进行目标检测
| test.avi # 由图片生成的视频
| test_yolo_out_py.avi # 视频输出结果
|
+---weights
| darknet-yolov3_final.weights # 训练得到的权重文件
|
+---test # dlib 数据格式的验证集
| 1.jpg
| 10.jpg
| 11.jpg
| ...
| 56.jpg
| 57.jpg
| test.xml
|
\---train # dlib 数据格式的训练集
1.jpg
...
95.jpg
96.jpg
97.jpg
98.jpg
99.jpg
train.xml
xml2txt.py
'''
dlib .xml file to yolo .txt file
python xml2txt.py dlib_train_path dlib_test_path
example:
python xml2txt.py /home/hichens/YOLOData/train/ /home/hichens/YOLOData/test/
'''
import cv2
import os
import subprocess
import sys
import random
import string
import shutil

# Command-line arguments: the dlib-style training and test folders, each
# holding the images plus train.xml / test.xml respectively.
train_path = sys.argv[1]
test_path = sys.argv[2]

# The dataset root is the parent of the training folder. abspath() drops any
# trailing separator, so ".../train" and ".../train/" resolve identically.
file_path = os.path.dirname(os.path.abspath(train_path))

# Start from empty output folders. They are created inside the dataset root
# (not the current working directory), which is where xml2txt() writes to.
for _out_name in ("JPEGImages", "labels"):
    _out_dir = os.path.join(file_path, _out_name)
    shutil.rmtree(_out_dir, ignore_errors=True)
    os.makedirs(_out_dir, exist_ok=True)
def xml2txt(xml_path):
    """Convert one dlib (imglab) XML annotation file into YOLO label files.

    The dataset root is taken to be the parent of the folder that contains
    ``xml_path``; the images are expected next to the XML file. For every
    ``<image>`` entry that has at least one ``<box>``:

    * the image is copied to ``<root>/JPEGImages/<8 random chars>.jpg``;
    * ``<root>/labels/<same 8 chars>.txt`` receives one line per box in the
      form ``0 x_center y_center width height`` (class id 0, all values
      normalised by the image width / height).

    Both output folders must already exist. Images without boxes and images
    that cannot be read are skipped.

    The parser is line based: each ``<image ...>`` and ``<box ...>`` tag must
    sit on its own line, use single-quoted attributes, and list the box
    attributes in the order top, left, width, height.
    """
    import shutil

    image_dir = os.path.dirname(os.path.abspath(xml_path))
    base_path = os.path.dirname(image_dir)
    alphabet = string.ascii_letters + string.digits

    img_name = None    # file attribute of the current <image> tag
    label_path = None  # label file of the current image, set on its first box
    img_size = None    # (height, width) of the current image

    with open(xml_path, 'r') as f:
        for line in f:
            ss = line.split()
            if not ss:
                continue

            if ss[0] == "<image":
                img_name = line.split("'")[1]
                label_path = None
                img_size = None
                print(img_name)
                continue

            if ss[0] != "<box" or img_name is None:
                continue

            ll = line.split("'")
            top, left, width, height = (int(v) for v in ll[1:8:2])

            if label_path is None:
                # First box of this image: copy the image once under a random
                # name and remember where its labels go, so that all boxes of
                # one image end up in the same label file.
                img_path = os.path.join(image_dir, img_name)
                img = cv2.imread(img_path)
                if img is None:
                    print("skip unreadable image:", img_path)
                    img_name = None  # also skips the remaining boxes
                    continue
                img_size = img.shape[:2]
                add_label = ''.join(random.sample(alphabet, 8))
                shutil.copy(img_path, os.path.join(base_path, "JPEGImages", add_label + '.jpg'))
                label_path = os.path.join(base_path, "labels", add_label + ".txt")

            H, W = img_size
            x_center, y_center = (left + width / 2) / W, (top + height / 2) / H
            w, h = width / W, height / H
            print(x_center, y_center, w, h)

            with open(label_path, 'a') as file:
                sentence = " ".join(str(i) for i in [0, x_center, y_center, w, h])
                file.write(sentence + "\n")


if __name__ == "__main__":
    xml2txt(os.path.join(train_path, "train.xml"))
    xml2txt(os.path.join(test_path, "test.xml"))
SplitData.py
'''
split the labelled images (one per file in labels/) into a training list and a validation list
python SplitData.py /home/hichens/YOLOData/
'''
import random
import os
import subprocess
import sys
def split_data_set(base_path):
    """Split the labelled images into a training list and a validation list.

    Every ``*.txt`` file in ``<base_path>/labels`` stands for one image
    ``<base_path>/JPEGImages/<same stem>.jpg``. A random 10 % of those images
    (rounded down) is written to ``eye_test.txt`` and the rest to
    ``eye_train.txt``, one image path per line. Both list files are created
    in the current working directory.

    Args:
        base_path: Dataset root containing ``labels`` and ``JPEGImages``;
            a trailing path separator is optional.
    """
    label_dir = os.path.join(base_path, 'labels')
    image_dir = os.path.join(base_path, 'JPEGImages')

    # Only real label files count towards the split size.
    stems = sorted(
        os.path.splitext(name)[0]
        for name in os.listdir(label_dir)
        if os.path.splitext(name)[1] == ".txt"
    )

    # Sample the validation members directly, so exactly 10 % are selected.
    data_test_size = int(0.1 * len(stems))
    test_stems = set(random.sample(stems, k=data_test_size))

    with open("eye_test.txt", 'w') as f_val, open("eye_train.txt", 'w') as f_train:
        for stem in stems:
            target = f_val if stem in test_stems else f_train
            target.write(image_dir + '/' + stem + '.jpg' + '\n')


if __name__ == "__main__":
    split_data_set(sys.argv[1])
img2video.py
'''
combine the images to video
python img2video.py image_path
example:
python img2video.py /home/hichens/YOLOData/test/
'''
# encoding: UTF-8
import glob as gb
import cv2
import sys
import os

in_path = sys.argv[1]
fps = 4  # the bigger the value is, the faster is the video.
size = (640, 480)  # output frame size as (width, height)


def _frame_order(path):
    """Sort key: purely numeric names (1.jpg, 2.jpg, 10.jpg) in numeric order, others after them."""
    stem = os.path.splitext(os.path.basename(path))[0]
    return (0, int(stem), "") if stem.isdecimal() else (1, 0, stem)


# glob returns files in arbitrary order; sort them so the frames play in sequence.
img_path = sorted(gb.glob(in_path + "*"), key=_frame_order)

videoWriter = cv2.VideoWriter('test.avi',
                              cv2.VideoWriter_fourcc('I', '4', '2', '0'), fps, size)

# Only every `step`-th file is used. Clamp to 1 so that folders with fewer
# than 30 files do not cause a modulo-by-zero.
step = max(1, len(img_path) // 30)

print("[", end="")
for i, path in enumerate(img_path):
    if i % step != 0:
        continue
    img = cv2.imread(path)
    if img is None:
        # Not an image (e.g. the dlib test.xml living in the same folder).
        continue
    img = cv2.resize(img, size)
    print(">", end="")
    videoWriter.write(img)
print("]")

# Finalise the container so the video file is fully written.
videoWriter.release()
print("OK!")
object_detection_yolo.py
'''
test the training result
example:
python object_detection_yolo.py --video=test.avi
python object_detection_yolo.py --image=bird.jpg
'''
import cv2 as cv
import argparse
import sys
import numpy as np
import os.path
# Detection thresholds and network input resolution.
confThreshold = 0.5  # minimum score for a detection to be kept
nmsThreshold = 0.4   # overlap threshold used by non-maximum suppression
inpWidth = 416       # network input width (608 is the other common choice)
inpHeight = 416      # network input height (608 is the other common choice)

# Command line: an image, a video, or neither of them.
parser = argparse.ArgumentParser(description='Object Detection using YOLO in OPENCV')
parser.add_argument('--image', help='Path to image file.')
parser.add_argument('--video', help='Path to video file.')
args = parser.parse_args()

# Class names, one per line.
classesFile = "classes.names"
with open(classesFile, 'rt') as names_file:
    names_text = names_file.read()
classes = names_text.rstrip('\n').split('\n')

# Build the network from the Darknet configuration and the trained weights,
# and run it with the OpenCV backend on the CPU.
modelConfiguration = "darknet-yolov3.cfg"
modelWeights = "../weights/darknet-yolov3_800.weights"
net = cv.dnn.readNetFromDarknet(modelConfiguration, modelWeights)
net.setPreferableBackend(cv.dnn.DNN_BACKEND_OPENCV)
net.setPreferableTarget(cv.dnn.DNN_TARGET_CPU)
# Get the names of the output layers
def getOutputsNames(net):
    """Return the names of the network's output layers.

    Output layers are the layers with unconnected outputs, as reported by
    ``net.getUnconnectedOutLayers()`` using 1-based layer indices.

    Args:
        net: Object providing ``getLayerNames()`` and
            ``getUnconnectedOutLayers()`` (here the loaded ``cv.dnn`` network).

    Returns:
        List of layer names, in the order the indices are reported.
    """
    layersNames = net.getLayerNames()
    # The indices come back either as an N x 1 array or as a flat array
    # depending on the OpenCV build; flatten so both shapes are handled.
    out_ids = np.asarray(net.getUnconnectedOutLayers()).reshape(-1)
    return [layersNames[int(i) - 1] for i in out_ids]
# Draw the predicted bounding box
def drawPred(classId, conf, left, top, right, bottom):
    """Draw one detection (box plus caption) onto the module-level ``frame``.

    The caption is the confidence with two decimals, prefixed by the class
    name when the module-level ``classes`` list is non-empty.

    Args:
        classId: Index into ``classes``.
        conf: Confidence of the detection.
        left, top, right, bottom: Box corners in pixel coordinates.
    """
    font = cv.FONT_HERSHEY_SIMPLEX

    # Green outline around the detected object.
    cv.rectangle(frame, (left, top), (right, bottom), (0, 255, 0), 3)

    caption = '{:.2f}'.format(conf)
    if classes:
        assert classId < len(classes)
        caption = '{}:{}'.format(classes[classId], caption)

    # Filled red background at the top of the box, then the caption in black.
    (text_w, text_h), baseLine = cv.getTextSize(caption, font, 0.5, 1)
    top = max(top, text_h)  # keep the caption inside the image
    bg_top_left = (left, top - round(1.5 * text_h))
    bg_bottom_right = (left + round(1.5 * text_w), top + baseLine)
    cv.rectangle(frame, bg_top_left, bg_bottom_right, (0, 0, 255), cv.FILLED)
    cv.putText(frame, caption, (left, top), font, 0.75, (0, 0, 0), 2)
# Remove the bounding boxes with low confidence using non-maxima suppression
def postprocess(frame, outs):
    """Filter the raw network outputs and draw the surviving detections.

    Each detection row is read as ``[cx, cy, w, h, objectness, score_0, ...]``
    with the box values relative to the frame size. Rows whose best class
    score exceeds ``confThreshold`` are kept, passed through non-maximum
    suppression (``nmsThreshold``) and drawn with ``drawPred``.

    Args:
        frame: Image the detections belong to; only its shape is read here.
        outs: Iterable of 2-D arrays, one per output layer.
    """
    frameHeight, frameWidth = frame.shape[:2]

    classIds = []
    confidences = []
    boxes = []
    for out in outs:
        print("out.shape : ", out.shape)
        for detection in out:
            scores = detection[5:]
            classId = np.argmax(scores)
            confidence = scores[classId]
            if detection[4] > confThreshold:
                # Debug output for rows with a high objectness value.
                print(detection[4], " - ", scores[classId], " - th : ", confThreshold)
                print(detection)
            if confidence > confThreshold:
                # Convert the relative centre/size box to a pixel
                # left/top/width/height box.
                center_x = int(detection[0] * frameWidth)
                center_y = int(detection[1] * frameHeight)
                width = int(detection[2] * frameWidth)
                height = int(detection[3] * frameHeight)
                left = int(center_x - width / 2)
                top = int(center_y - height / 2)
                classIds.append(classId)
                confidences.append(float(confidence))
                boxes.append([left, top, width, height])

    # Perform non maximum suppression to eliminate redundant overlapping
    # boxes with lower confidences.
    indices = cv.dnn.NMSBoxes(boxes, confidences, confThreshold, nmsThreshold)

    # The kept indices come back as an N x 1 array, a flat array or an empty
    # tuple depending on the OpenCV build; flatten to handle all of them.
    for i in np.asarray(indices, dtype=int).reshape(-1):
        left, top, width, height = boxes[i]
        drawPred(classIds[i], confidences[i], left, top, left + width, top + height)
# Process inputs
winName = 'Deep learning object detection in OpenCV'
cv.namedWindow(winName, cv.WINDOW_NORMAL)

outputFile = "yolo_out_py.avi"
if args.image:
    # Open the image file
    if not os.path.isfile(args.image):
        print("Input image file ", args.image, " doesn't exist")
        sys.exit(1)
    cap = cv.VideoCapture(args.image)
    # splitext() instead of [:-4] so extensions such as ".jpeg" work too.
    outputFile = os.path.splitext(args.image)[0] + '_yolo_out_py.jpg'
elif args.video:
    # Open the video file
    if not os.path.isfile(args.video):
        print("Input video file ", args.video, " doesn't exist")
        sys.exit(1)
    cap = cv.VideoCapture(args.video)
    outputFile = os.path.splitext(args.video)[0] + '_yolo_out_py.avi'
else:
    # Webcam input
    cap = cv.VideoCapture(0)

# Get the video writer initialized to save the output video
vid_writer = None
if not args.image:
    vid_writer = cv.VideoWriter(outputFile,
                                cv.VideoWriter_fourcc('M', 'J', 'P', 'G'),
                                4,
                                (round(cap.get(cv.CAP_PROP_FRAME_WIDTH)),
                                 round(cap.get(cv.CAP_PROP_FRAME_HEIGHT))))

try:
    # Runs until the input is exhausted or a key is pressed.
    while cv.waitKey(1) < 0:
        # get frame from the video
        hasFrame, frame = cap.read()

        # Stop the program if reached end of video
        if not hasFrame:
            print("Done processing !!!")
            print("Output file is stored as ", outputFile)
            cv.waitKey(3000)
            break

        # Create a 4D blob from a frame.
        blob = cv.dnn.blobFromImage(frame, 1/255, (inpWidth, inpHeight), [0, 0, 0], 1, crop=False)

        # Sets the input to the network
        net.setInput(blob)

        # Runs the forward pass to get output of the output layers
        outs = net.forward(getOutputsNames(net))

        # Remove the bounding boxes with low confidence
        postprocess(frame, outs)

        # Write the frame with the detection boxes
        if args.image:
            cv.imwrite(outputFile, frame.astype(np.uint8))
        else:
            vid_writer.write(frame.astype(np.uint8))

        cv.imshow(winName, frame)
finally:
    # Release the capture and finalise the output video even when the loop
    # ends with an error.
    cap.release()
    if vid_writer is not None:
        vid_writer.release()
    cv.destroyAllWindows()