相信了解机器学习的人对MNIST都不会陌生。Google的工程师Yaroslav Bulatov创建了notMNIST,它和MNIST类似,图像大小为28x28,也有10个Label(A-J)。
在Tensorflow中已经封装好了读取MNIST数据集的函数 read_data_sets(),
# Load MNIST with the reader bundled in TensorFlow 1.x (the `contrib`
# package no longer exists in TensorFlow 2.x).
from tensorflow.contrib.learn.python.learn.datasets.mnist import read_data_sets

# "data" is the directory holding the idx/ubyte files. The keyword arguments
# request one-hot labels, un-flattened images and no validation split
# (see the read_data_sets documentation for the exact semantics).
mnist = read_data_sets("data", one_hot=True, reshape=False, validation_size=0)
但是由于notMNIST的格式和MNIST的格式不是完全相同,所以基于tensorflow创建的针对MNIST的模型并不能直接读取notMNIST的图片。
Github上有人编写了格式转换代码(https://github.com/davidflanagan/notMNIST-to-MNIST),转换后可直接使用read_data_sets()完成读取,这样模型代码的变动就不会很大。本文是我在阅读完该代码后所做的注释。
"""Convert notMNIST PNG folders (A-J) into MNIST-style idx label/image files."""

import glob  # unused here; kept because other parts of the file may rely on it
import os
import random
import sys

import numpy

# NOTE: imageio (third-party image reader) is imported lazily inside
# make_arrays(), so the file-listing and writer helpers stay importable on
# machines that do not have it installed.


def get_labels_and_files(folder, number):
    """Pick `number` random PNG files for each of the ten labels.

    Args:
        folder: directory containing the sub-directories ``A`` .. ``J``.
        number: how many files to sample from every sub-directory.

    Returns:
        A list of ``(label, filename)`` tuples where ``label`` is 0-9
        (0 for ``A`` ... 9 for ``J``) and ``filename`` is the joined path.

    Raises:
        ValueError: from ``random.sample`` when a sub-directory holds fewer
            than `number` usable files.
        OSError: when one of the ``A`` .. ``J`` directories cannot be listed.
    """
    # One list of candidate files per label.
    filelists = []
    for label in range(10):
        # label is 0-9; chr(ord('A') + label) maps it to 'A'-'J'.
        dirname = os.path.join(folder, chr(ord('A') + label))
        filelist = []
        for file in os.listdir(dirname):
            if not file.endswith('.png'):
                continue
            fullname = os.path.join(dirname, file)
            if os.path.getsize(fullname) > 0:
                filelist.append(fullname)
            else:
                # Zero-byte files cannot be decoded; report and skip them.
                print('file ' + fullname + ' is empty')
        # Sort so every run starts from the same order, whatever order the
        # OS returned the directory entries in.
        filelist.sort()
        filelists.append(filelist)

    # Sample per label. The sample is only reproducible across runs if the
    # caller seeded the `random` module beforehand (see main()).
    labelsAndFiles = []
    for label, filelist in enumerate(filelists):
        for filename in random.sample(filelist, number):
            labelsAndFiles.append((label, filename))
    return labelsAndFiles


def make_arrays(labelsAndFiles, reader=None):
    """Read every listed image and pack images and labels into uint8 arrays.

    Files that cannot be read are reported and skipped, so the returned
    arrays may hold fewer entries than `labelsAndFiles`.

    Args:
        labelsAndFiles: sequence of ``(label, filename)`` entries.
        reader: optional callable ``reader(filename) -> 28x28 array``.
            Defaults to ``imageio.imread``.

    Returns:
        ``(imagedata, labeldata)``: a ``(count, 28, 28)`` uint8 array and a
        ``(count,)`` uint8 array, where ``count`` is the number of images
        that were read successfully.
    """
    if reader is None:
        import imageio
        reader = imageio.imread

    total = len(labelsAndFiles)
    images = []
    labels = []
    for i, entry in enumerate(labelsAndFiles):
        label, filename = entry[0], entry[1]

        # Display progress, since this can take a while. "\r" returns the
        # cursor to the start of the line so the text is overwritten.
        if i % 100 == 0:
            sys.stdout.write("\r%d%% complete" % (i * 100 // total))
            sys.stdout.flush()

        try:
            image = reader(filename)
        except Exception:
            # Narrower than a bare `except:` so Ctrl-C still aborts the run.
            # If this happens we won't have the requested number of images.
            print("\nCan't read image file " + filename)
            continue
        images.append(image)
        labels.append(label)

    count = len(images)
    imagedata = numpy.zeros((count, 28, 28), dtype=numpy.uint8)
    labeldata = numpy.zeros(count, dtype=numpy.uint8)
    # Iterate over what was actually read, not over the requested list:
    # the two differ as soon as one file was skipped above.
    for i in range(count):
        imagedata[i] = images[i]
        labeldata[i] = labels[i]
    print("\n")
    return imagedata, labeldata


def write_labeldata(labeldata, outputfile):
    """Write labels to `outputfile`: an 8-byte header, then the raw bytes.

    The header is two big-endian 32-bit integers: the magic number 0x0801
    and the number of labels.
    """
    header = numpy.array([0x0801, len(labeldata)], dtype='>i4')
    # `with` closes the file even if one of the writes raises.
    with open(outputfile, "wb") as f:
        f.write(header.tobytes())
        f.write(labeldata.tobytes())


def write_imagedata(imagedata, outputfile):
    """Write images to `outputfile`: a 16-byte header, then the raw bytes.

    The header is four big-endian 32-bit integers: the magic number 0x0803,
    the number of images, and the fixed image size 28 x 28.
    """
    header = numpy.array([0x0803, len(imagedata), 28, 28], dtype='>i4')
    with open(outputfile, "wb") as f:
        f.write(header.tobytes())
        f.write(imagedata.tobytes())


def main(argv):
    """Command-line entry point.

    Args:
        argv: ``[prog, folder, images_per_label, label_file, image_file]``.
    """
    if len(argv) < 5:
        prog = argv[0] if argv else "convert_to_mnist_format.py"
        sys.exit("usage: %s <folder> <images-per-label> <label-file> <image-file>"
                 % prog)

    # Uncomment the line below if you want to seed the random number
    # generator the same way the original author did to produce the data
    # files in their repository. With a fixed seed every run draws the same
    # sample; without it each run differs.
    # random.seed(int("notMNIST", 36))

    labelsAndFiles = get_labels_and_files(argv[1], int(argv[2]))
    # Shuffle so the labels are not grouped A, A, ..., B, B, ... in the output.
    random.shuffle(labelsAndFiles)

    imagedata, labeldata = make_arrays(labelsAndFiles)
    write_labeldata(labeldata, argv[3])
    write_imagedata(imagedata, argv[4])


if __name__ == '__main__':
    # Runs only when the file is executed directly, not when it is imported.
    main(sys.argv)
使用方法
下载解压notMNIST:
# Download the small and the large notMNIST archives.
curl -o notMNIST_small.tar.gz http://yaroslavvb.com/upload/notMNIST/notMNIST_small.tar.gz
curl -o notMNIST_large.tar.gz http://yaroslavvb.com/upload/notMNIST/notMNIST_large.tar.gz
# Unpack both archives into the current directory.
tar xzf notMNIST_small.tar.gz
tar xzf notMNIST_large.tar.gz
运行转换代码:
# The script opens its output files directly, so the target directory must exist.
mkdir -p data
# Test set: 1000 images per label sampled from notMNIST_small.
python convert_to_mnist_format.py notMNIST_small 1000 data/t10k-labels-idx1-ubyte data/t10k-images-idx3-ubyte
# Training set: 6000 images per label sampled from notMNIST_large.
python convert_to_mnist_format.py notMNIST_large 6000 data/train-labels-idx1-ubyte data/train-images-idx3-ubyte
# Compress the four generated files.
gzip data/*ubyte