数据集拆分,互转,可视化,查错
分享几种常用的数据集处理代码。使用的时候记得改成自己的路径:ann_dir是存放coco标注json文件的目录,img_dir是存放图片的目录。如果PyCharm控制台输出的中文为乱码,将PyCharm中的编码全部改成utf-8(设置->编辑器->文件编码),把能改成utf-8的选项都改了。
1️⃣ 有些数据集的标注文件中,类别名是以unicode转义形式保存的中文。我们先记录好每个类别名对应的id(即它在category列表中的下标),然后将标注文件中的类别名转成id。
# -*- coding: utf-8 -*-
import json
import os
import random
import time
import shutil
import glob
# Defect class names of the cloth-flaw dataset; the functions below use a
# name's position in this list as its numeric class id.
category = [
    '无瑕疵', '花板跳', '水渍', '星跳', '浆斑',
    '油渍', '烧毛痕', '死皱', '筘路', '浪纹档',
    '三丝', '跳纱', '双经', '修痕', '污渍',
    '百脚', '松经', '跳花', '吊经', '纬纱不良',
    '断氨纶', '双纬', '粗维', '磨痕', '云织',
    '整经结', '稀密档', '断经', '粗经', '纬缩',
    '色差档', '毛粒', '破洞', '结头', '轧痕',
]
# Every dataset path is resolved relative to the current working directory.
root_path = os.getcwd()
# Directory holding the annotation JSON files.
ann_dir = os.path.join(
    root_path,
    "smartdiagnosisofclothflaw_round1train1_datasets",
    "guangdong1_round1_train1_20190818",
    "Annotations",
)
# Directory holding the images.
img_dir = os.path.join(
    root_path,
    "smartdiagnosisofclothflaw_round1train1_datasets",
    "guangdong1_round1_train1_20190818",
    "defect_Images",
)
# Fraction of the samples assigned to the training split.
train_percent = 0.8
#####################################################################################
##### 数据集中文类别名改为数字id
#####################################################################################
def unicode2id(ann_file=None, out_file=None, categories=None):
    """Rewrite an annotation file with numeric class ids instead of class names.

    Every record of the input JSON list is reduced to its ``name``,
    ``defect_name`` and ``bbox`` fields, with ``defect_name`` replaced by the
    index of that name in ``categories``.

    Args:
        ann_file: Input annotation JSON. Defaults to ``anno_train.json``
            inside the module-level ``ann_dir``.
        out_file: Output JSON path. Defaults to ``data.json`` inside
            ``ann_dir``.
        categories: Ordered class names. Defaults to the module-level
            ``category`` list.

    Raises:
        ValueError: If a record uses a class name missing from ``categories``.
    """
    if ann_file is None:
        ann_file = os.path.join(ann_dir, "anno_train.json")
    if out_file is None:
        out_file = os.path.join(ann_dir, 'data.json')
    if categories is None:
        categories = category
    print(ann_file)

    # First occurrence wins, which matches what list.index() would return.
    name_to_id = {}
    for idx, class_name in enumerate(categories):
        name_to_id.setdefault(class_name, idx)

    # json already decodes \uXXXX escapes; reading the file as UTF-8 also
    # handles files that store the Chinese names directly, which the
    # 'unicode_escape' codec would turn into mojibake.
    with open(ann_file, 'r', encoding='utf-8') as f:
        json_data = json.load(f)

    data1 = []
    for record in json_data:
        defect_name = record['defect_name']
        if defect_name not in name_to_id:
            raise ValueError(
                "unknown defect_name %r for image %r" % (defect_name, record['name'])
            )
        data1.append({
            'name': record['name'],
            'defect_name': name_to_id[defect_name],
            'bbox': record['bbox'],
        })

    with open(out_file, 'w', encoding='utf-8') as f:
        json.dump(data1, f)
2️⃣ COCO数据集划分为train和val
#####################################################################################
##### COCO数据集划分为train,val
#####################################################################################
def coco_dataset_split():
    """Randomly split the raw dataset into COCO2017-style train/val folders.

    Images listed in the module-level ``img_dir`` are copied into
    ``<root_path>/COCO2017/train2017`` and ``.../val2017``; ``train_percent``
    of them (rounded down) are drawn for train with ``random.sample``. The
    records of ``anno_train.json`` in ``ann_dir`` are written to
    ``annotations/instances_train2017.json`` and
    ``annotations/instances_val2017.json``, keeping ``name``, ``bbox`` and
    ``defect_name`` (replaced by its index in ``category``).

    Raises:
        ValueError: If a record's ``defect_name`` is not in ``category``.
    """
    time_start = time.time()

    # Output layout; exist_ok makes re-runs safe.
    coco_root = os.path.join(root_path, "COCO2017")
    save_img_train_dir = os.path.join(coco_root, "train2017")
    save_img_val_dir = os.path.join(coco_root, "val2017")
    save_ann_dir = os.path.join(coco_root, "annotations")
    for folder in (save_ann_dir, save_img_train_dir, save_img_val_dir):
        os.makedirs(folder, exist_ok=True)
    save_ann_train_file = os.path.join(save_ann_dir, "instances_train2017.json")
    save_ann_val_file = os.path.join(save_ann_dir, "instances_val2017.json")

    # Decide which images go where.
    images_list = os.listdir(img_dir)
    images_num = len(images_list)
    train_num = int(images_num * train_percent)
    val_num = images_num - train_num
    train_list = random.sample(images_list, train_num)
    train_set = set(train_list)
    val_list = [image_name for image_name in images_list if image_name not in train_set]
    # Set lookup keeps the per-annotation membership test O(1).
    val_set = set(val_list)
    print("| Images num: ", images_num)
    print("| Train num: ", train_num)
    print("| Val num: ", val_num)

    # Copy the images.
    for image_name in train_list:
        shutil.copy(os.path.join(img_dir, image_name), os.path.join(save_img_train_dir, image_name))
    for image_name in val_list:
        shutil.copy(os.path.join(img_dir, image_name), os.path.join(save_img_val_dir, image_name))

    # json decodes \uXXXX escapes itself, so the file is read as plain UTF-8.
    ann_path = os.path.join(ann_dir, "anno_train.json")
    with open(ann_path, 'r', encoding='utf-8') as fp:
        json_data = json.load(fp)

    # Route every annotation; anything not in the val split goes to train.
    train2017 = []
    val2017 = []
    for record in json_data:
        converted = {
            'name': record['name'],
            'defect_name': category.index(record['defect_name']),
            'bbox': record['bbox'],
        }
        if record['name'] in val_set:
            val2017.append(converted)
        else:
            train2017.append(converted)

    with open(save_ann_train_file, 'w') as fp:
        json.dump(train2017, fp)
    with open(save_ann_val_file, 'w') as fp:
        json.dump(val2017, fp)

    time_end = time.time()
    cost_time = time_end - time_start
    print("| Cost time: ", cost_time // 60 // 60, "hour", cost_time // 60 % 60, "min", cost_time % 60, "s")
3️⃣ COCO数据集转换成VOC数据集,复制图片比较耗时,耐心等待就行了。为了节省时间,没有可视化复制图片的进度。如果想加,可以百度一下tqdm,加到复制图片的for循环中就可以了。
#####################################################################################
##### coco数据集转换成voc数据集
#####################################################################################
def coco2voc():
    """Convert the id-based annotation list (``data.json``) to a VOC2012 layout.

    Reads ``data.json`` from the module-level ``ann_dir``, copies every
    annotated image from ``img_dir`` into
    ``<root_path>/VOCdevkit/VOC2012/JPEGImages`` and writes one XML file per
    image into ``Annotations``. Each XML holds the file name, the image size
    and one ``<object>`` per annotation, whose ``bbox`` values are written in
    order as ``xmin, ymin, xmax, ymax``.
    """
    from lxml.etree import Element, SubElement, tostring
    from xml.dom.minidom import parseString
    from PIL import Image

    # Create every output folder individually so a partially existing tree
    # is completed instead of failing later.
    voc_root = os.path.join(root_path, "VOCdevkit", "VOC2012")
    save_xml_path = os.path.join(voc_root, "Annotations")
    save_img_path = os.path.join(voc_root, "JPEGImages")
    for folder in (save_xml_path, os.path.join(voc_root, "ImageSets", "Main"), save_img_path):
        os.makedirs(folder, exist_ok=True)

    ann_path = os.path.join(ann_dir, "data.json")
    with open(ann_path, "r", encoding='utf-8') as ann_file:
        ann_json_list = json.load(ann_file)

    # Group the class id and bbox of every annotation by image name.
    img_bbox_category = {}
    for ann in ann_json_list:
        img_bbox_category.setdefault(ann['name'], []).append(
            {"category": ann['defect_name'], "bbox": ann['bbox']}
        )

    print('| Images start copy.')
    for img_name in img_bbox_category:
        shutil.copy(os.path.join(img_dir, img_name), os.path.join(save_img_path, img_name))
    print('| Images copy finish.')

    print('| Jsons start transform')
    for img_name, annotations in img_bbox_category.items():
        root_node = Element('annotation')
        node_filename = SubElement(root_node, 'filename')
        node_filename.text = img_name

        node_size = SubElement(root_node, 'size')
        node_width = SubElement(node_size, 'width')
        node_height = SubElement(node_size, 'height')
        # Only the size is needed; close the image file right away.
        with Image.open(os.path.join(img_dir, img_name)) as img_m:
            node_width.text = str(img_m.width)
            node_height.text = str(img_m.height)

        for annotation in annotations:
            bbox_temp = annotation["bbox"]
            node_object = SubElement(root_node, 'object')
            node_name = SubElement(node_object, 'name')
            node_name.text = str(annotation["category"])
            node_bndbox = SubElement(node_object, 'bndbox')
            for tag, value in zip(('xmin', 'ymin', 'xmax', 'ymax'), bbox_temp):
                SubElement(node_bndbox, tag).text = str(value)

        dom = parseString(tostring(root_node))
        # splitext drops only the real extension, whatever it is.
        xml_name = os.path.join(save_xml_path, os.path.splitext(img_name)[0] + '.xml')
        with open(xml_name, 'wb') as f:
            f.write(dom.toprettyxml(indent='\t', encoding='utf-8'))
    print('| Jsons transform finish.')
4️⃣ voc数据集转换成coco数据集
#####################################################################################
##### voc数据集转换成coco数据集
#####################################################################################
def _voc2coco_read_stems(list_path):
    """Return the image stems listed one per line in a VOC ImageSets file."""
    # strip() copes with a missing final newline and with '\r\n' endings.
    with open(list_path) as fp:
        return [line.strip() for line in fp if line.strip()]


def _voc2coco_parse_boxes(xml_path, class_name_to_id):
    """Return ``(category_id, xmin, ymin, xmax, ymax)`` per object of a VOC XML."""
    import xml.etree.ElementTree as ET

    boxes = []
    for obj in ET.parse(xml_path).getroot().findall('object'):
        bndbox = obj.find('bndbox')
        if bndbox is None:
            raise ValueError("object without <bndbox> in " + xml_path)
        category_id = class_name_to_id[obj.find('name').text]
        xmin, ymin, xmax, ymax = (
            float(bndbox.find(tag).text) for tag in ('xmin', 'ymin', 'xmax', 'ymax')
        )
        boxes.append((category_id, xmin, ymin, xmax, ymax))
    return boxes


def _voc2coco_build_split(header, stems, voc_root, images_id, class_name_to_id, first_bbox_id):
    """Build the COCO dict for one split; return it with the next free bbox id."""
    from PIL import Image

    data = dict(
        info=header["info"],
        licenses=header["licenses"],
        images=[],
        type="instances",
        annotations=[],
        categories=header["categories"],
    )
    bbox_id = first_bbox_id
    for stem in stems:
        file_name = stem + ".jpg"
        # Only the size is needed; close the image file right away.
        with Image.open(os.path.join(voc_root, 'JPEGImages', file_name)) as img:
            height, width = img.height, img.width
        data["images"].append(
            dict(
                license=0,
                url=None,
                file_name=file_name,
                height=height,
                width=width,
                date_captured=None,
                id=images_id[stem],
            )
        )
    for stem in stems:
        xml_path = os.path.join(voc_root, 'Annotations', stem + '.xml')
        for category_id, xmin, ymin, xmax, ymax in _voc2coco_parse_boxes(xml_path, class_name_to_id):
            box_w = xmax - xmin
            box_h = ymax - ymin
            data["annotations"].append(
                dict(
                    id=bbox_id,
                    image_id=images_id[stem],
                    category_id=category_id,
                    area=box_w * box_h,
                    # COCO boxes are [x, y, width, height].
                    bbox=[xmin, ymin, box_w, box_h],
                    iscrowd=0,
                )
            )
            bbox_id += 1
    return data, bbox_id


def voc2coco(class_name_to_id=None):
    """Convert ``<root_path>/VOCdevkit/VOC2012`` into a ``coco2017`` folder.

    The train/val image lists are read from ``ImageSets/Main/train.txt`` and
    ``val.txt``. For each split a COCO-style JSON
    (``annotations/instances_<split>2017.json``) is written, then the ``.jpg``
    images are copied into ``train2017`` / ``val2017``. Image ids come from
    the position of the image in the ``JPEGImages`` directory listing, and
    annotation ids keep counting from train into val.

    Args:
        class_name_to_id: Mapping from the class name stored in the VOC XML
            ``<name>`` tag to the COCO category id. Defaults to
            ``class1`` .. ``class8`` mapped to 1 .. 8.

    Raises:
        KeyError: If an XML uses a class name missing from the mapping, or a
            listed image is not present in ``JPEGImages``.
        ValueError: If an ``<object>`` has no ``<bndbox>``.
    """
    import datetime

    if class_name_to_id is None:
        class_name_to_id = {'class1': 1, 'class2': 2, 'class3': 3, 'class4': 4,
                            'class5': 5, 'class6': 6, 'class7': 7, 'class8': 8}

    voc_root = os.path.join(root_path, 'VOCdevkit', 'VOC2012')
    images_dir = os.path.join(voc_root, 'JPEGImages')
    coco_root = os.path.join(root_path, "coco2017")
    # exist_ok completes a partially existing output tree.
    for sub_dir in ("annotations", "train2017", "val2017"):
        os.makedirs(os.path.join(coco_root, sub_dir), exist_ok=True)

    # Parts of the COCO file shared by both splits.
    now = datetime.datetime.now()
    header = dict(
        info=dict(
            description=None,
            url=None,
            version=None,
            year=now.year,
            contributor=None,
            date_created=now.strftime("%Y-%m-%d %H:%M:%S.%f"),
        ),
        licenses=[dict(url=None, id=0, name=None)],
        categories=[
            dict(supercategory=None, id=class_id, name=class_name)
            for class_name, class_id in class_name_to_id.items()
        ],
    )

    # One id per image, keyed by the file name without its extension.
    images_id = {
        os.path.splitext(image_name)[0]: idx
        for idx, image_name in enumerate(os.listdir(images_dir))
    }

    next_bbox_id = 0
    split_stems = {}
    for split in ("train", "val"):
        stems = _voc2coco_read_stems(
            os.path.join(voc_root, 'ImageSets', 'Main', split + '.txt')
        )
        data, next_bbox_id = _voc2coco_build_split(
            header, stems, voc_root, images_id, class_name_to_id, next_bbox_id
        )
        json_path = os.path.join(coco_root, 'annotations', 'instances_%s2017.json' % split)
        with open(json_path, 'w') as fp:
            json.dump(data, fp)
        split_stems[split] = stems
    print('| VOC -> COCO annotations transform finish.')

    print('Start copy images...')
    for split, message in (("train", '| Train images copy finish.'),
                           ("val", '| Val images copy finish.')):
        for stem in split_stems[split]:
            img_name = stem + ".jpg"
            shutil.copy(os.path.join(images_dir, img_name),
                        os.path.join(coco_root, split + '2017', img_name))
        print(message)
5️⃣ VOC数据集划分为train和val
#####################################################################################
##### voc数据集划分为train,val
#####################################################################################
def voc_dataset_split(voc_root=None, train_ratio=None):
    """Write ``train.txt`` / ``val.txt`` for a VOC dataset.

    Every ``*.xml`` in ``<voc_root>/Annotations`` contributes its file name
    without extension. ``int(total * train_ratio)`` names are drawn with
    ``random.sample`` for ``ImageSets/Main/train.txt``; the rest go to
    ``val.txt``. Both files hold one name per line.

    Args:
        voc_root: VOC dataset folder. Defaults to
            ``<root_path>/VOCdevkit/VOC2012``.
        train_ratio: Fraction of samples for train. Defaults to the
            module-level ``train_percent``.
    """
    if voc_root is None:
        voc_root = os.path.join(root_path, "VOCdevkit", "VOC2012")
    if train_ratio is None:
        train_ratio = train_percent

    main_dir = os.path.join(voc_root, "ImageSets", "Main")
    os.makedirs(main_dir, exist_ok=True)

    xml_paths = glob.glob(os.path.join(voc_root, "Annotations", "*.xml"))
    # basename works with any path separator (splitting on '\\' only worked
    # on Windows); sorting makes a seeded split independent of glob order.
    stems = sorted(os.path.splitext(os.path.basename(path))[0] for path in xml_paths)

    num_train = int(len(stems) * train_ratio)
    train_stems = set(random.sample(stems, num_train))

    with open(os.path.join(main_dir, "train.txt"), 'w') as file_train, \
            open(os.path.join(main_dir, "val.txt"), 'w') as file_val:
        for stem in stems:
            target = file_train if stem in train_stems else file_val
            target.write(stem + '\n')
6️⃣ 检查数据集中图片是否有损坏
#####################################################################################
##### OSError: image file is truncated (9 bytes not processed)
##### 检查数据集中图片是否有损坏。找到有问题图片,删掉它,并修改数据集。
#####################################################################################
def check_images(images_dir=None):
    """Print and return the names of image files that cannot be fully loaded.

    Every entry of ``images_dir`` is opened with PIL and fully decoded with
    ``load()``; entries for which that raises ``OSError`` are reported.

    Args:
        images_dir: Folder to scan. Defaults to
            ``<root_path>/VOCdevkit/VOC2012/JPEGImages``.

    Returns:
        The list of file names that failed to load, in listing order.
    """
    from PIL import Image

    if images_dir is None:
        images_dir = os.path.join(root_path, 'VOCdevkit', 'VOC2012', 'JPEGImages')

    broken = []
    for file_name in os.listdir(images_dir):
        try:
            # The context manager closes the file handle after each check.
            with Image.open(os.path.join(images_dir, file_name)) as img:
                img.load()  # a truncated file raises OSError here
        except OSError:  # FileNotFoundError is a subclass of OSError
            print(file_name)
            broken.append(file_name)
    return broken
7️⃣ coco数据集将gt可视化,查看
#####################################################################################
# ##### coco数据集将gt可视化,查看
# #####################################################################################
def visiual_gt(num_samples=5):
    """Show randomly chosen training images with their ground-truth boxes.

    Reads ``<root_path>/COCO2017/annotations/instances_train2017.json`` (a
    list of records with ``name`` and ``bbox``), picks up to ``num_samples``
    distinct images and displays each one with its boxes drawn in green,
    waiting for a key press between images.

    Args:
        num_samples: Maximum number of images to display.
    """
    import cv2

    json_file = os.path.join(root_path, 'COCO2017', 'annotations', 'instances_train2017.json')
    with open(json_file, 'r') as fp:
        data = json.load(fp)

    # Group the boxes per image so each image is a single sampling candidate.
    bboxes_by_image = {}
    for record in data:
        bboxes_by_image.setdefault(record['name'], []).append(record['bbox'])

    image_names = list(bboxes_by_image)
    # Never ask for more samples than there are images.
    sample_size = min(num_samples, len(image_names))
    for image_name in random.sample(image_names, sample_size):
        img = cv2.imread(os.path.join(root_path, 'COCO2017', 'train2017', image_name))
        if img is None:
            # imread returned no image; skip instead of crashing in rectangle().
            print("| Cannot read image: ", image_name)
            continue
        for bbox in bboxes_by_image[image_name]:
            # bbox is used here as (xmin, ymin, xmax, ymax); adapt these two
            # lines if the dataset stores boxes differently.
            left_top = (int(bbox[0]), int(bbox[1]))
            right_bottom = (int(bbox[2]), int(bbox[3]))
            cv2.rectangle(img, left_top, right_bottom, (0, 255, 0), 2)
        cv2.imshow('image', img)
        cv2.waitKey(0)
        cv2.destroyAllWindows()
博客所有的代码,都统一放到一个python文件中,用哪个就调用哪个函数。
if __name__ == '__main__':
    # Seed the random module so the sampling done by the steps is repeatable.
    random.seed(777)
    divider = "—" * 50
    print(divider)
    # Uncomment the step to run; only one is enabled at a time.
    # unicode2id()  # convert class names in the annotations to ids
    # coco_dataset_split()  # split the COCO dataset
    # coco2voc()  # convert the COCO dataset to VOC
    # voc_dataset_split()  # split the VOC dataset
    # check_images()  # check for damaged images
    # visiual_gt()  # visualise the COCO ground truth
    voc2coco()  # convert the VOC dataset to COCO
    print(divider)
⭐ 完结撒花,如果有需要帮助的评论或者私聊都可以,看到就回答了。