import os
from PIL import Image
from array import *
from random import shuffle
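# Expected directory layout:
# ├── training-images
# │   ├── 0 (images of class 0)
# │   ├── 1 (images of class 1)
# │   ├── 2 (images of class 2)
# │   ├── 3 (images of class 3)
# │   └── 4 (images of class 4)
# ├── test-images
# │   ├── 0 (images of class 0)
# │   ├── 1 (images of class 1)
# │   ├── 2 (images of class 2)
# │   ├── 3 (images of class 3)
# │   └── 4 (images of class 4)
# └── mnist数据集制作.py (this script)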
# Load from and save to: [source directory, output file prefix] pairs.
Names = [['./training-images', 'train'], ['./test-images', 'test']]

# IDX magic numbers: two zero bytes, the element type (0x08 = unsigned byte),
# then the number of dimensions (1 for the label vector, 3 for the image cube).
IDX_LABEL_MAGIC = 0x00000801
IDX_IMAGE_MAGIC = 0x00000803


def build_idx_header(magic, *dims):
    """Return an IDX header as bytes.

    The header is the magic number followed by every dimension size, each
    written as a 4-byte big-endian unsigned integer. Using the full four
    bytes removes the old limits of 65535 items and 255-pixel sides.

    Raises:
        OverflowError: if a value is negative or does not fit in 32 bits.
    """
    return b''.join(value.to_bytes(4, 'big') for value in (magic, *dims))


def collect_image_files(root):
    """Return a list of (png_path, label) pairs found below *root*.

    Every sub-directory of *root* is one class; its name is the integer
    label. Entries of *root* that are not directories (for example the
    .DS_Store file created by macOS) are skipped instead of crashing.

    Raises:
        ValueError: if a class directory containing PNG files does not have
            an integer name.
    """
    samples = []
    for dirname in sorted(os.listdir(root)):
        path = os.path.join(root, dirname)
        if not os.path.isdir(path):
            continue
        for filename in sorted(os.listdir(path)):
            if filename.endswith(".png"):
                # The label travels with the path so it never has to be
                # re-parsed from a path string later on.
                samples.append((os.path.join(path, filename), int(dirname)))
    return samples


def read_pixels(image):
    """Return the pixels of *image* in row-major order as an array('B').

    *image* must provide ``size`` as (width, height) and ``load()`` returning
    an object indexed by (column, row), as PIL images do. Each pixel must be
    a single integer in 0..255 (a single-channel 8-bit image); anything else
    makes ``array.append`` raise TypeError or OverflowError.
    """
    width, height = image.size
    access = image.load()
    pixels = array('B')
    for row in range(height):
        for col in range(width):
            # Pixel access is (x, y) == (column, row); iterating rows in the
            # outer loop yields the row-major layout used by IDX files.
            pixels.append(access[col, row])
    return pixels


def convert_dataset(source_dir, prefix):
    """Pack the labelled PNG files below *source_dir* into two IDX files.

    Writes ``<prefix>-images-idx3-ubyte`` and ``<prefix>-labels-idx1-ubyte``
    into the current working directory.

    Raises:
        ValueError: if no PNG file is found or the images differ in size.
        OverflowError: if a label is outside 0..255.
    """
    print(os.listdir(source_dir))
    samples = collect_image_files(source_dir)
    print([path for path, _ in samples])
    if not samples:
        raise ValueError('No .png images found under ' + source_dir)

    shuffle(samples)  # Useful for further segmenting the validation set

    data_image = array('B')
    data_label = array('B')
    size = None
    for path, label in samples:
        print(path)
        with Image.open(path) as image:
            if size is None:
                size = image.size
            elif image.size != size:
                # One header describes every image, so sizes must agree.
                raise ValueError(
                    'Image %s has size %s, expected %s'
                    % (path, image.size, size))
            data_image.extend(read_pixels(image))
        data_label.append(label)  # one unsigned byte per label

    width, height = size
    count = len(samples)

    # IDX3 stores the number of rows (height) before the number of columns.
    with open(prefix + '-images-idx3-ubyte', 'wb') as output_file:
        output_file.write(
            build_idx_header(IDX_IMAGE_MAGIC, count, height, width))
        data_image.tofile(output_file)

    with open(prefix + '-labels-idx1-ubyte', 'wb') as output_file:
        output_file.write(build_idx_header(IDX_LABEL_MAGIC, count))
        data_label.tofile(output_file)


def main():
    """Convert every dataset listed in Names."""
    for source_dir, prefix in Names:
        convert_dataset(source_dir, prefix)


if __name__ == "__main__":
    main()
# Running this script produces four files: test-images-idx3-ubyte, test-labels-idx1-ubyte,
# train-images-idx3-ubyte and train-labels-idx1-ubyte.
# Compress each of them from a shell, e.g. `gzip -c train-labels-idx1-ubyte > train-labels-idx1-ubyte.gz`,
# to obtain the final MNIST-format dataset.