Python实现朴素贝叶斯识别MNIST手写图片
目前只实现了数字 0、1、2 的识别
import datetime
import math
import numpy as np
import os
from PIL import Image
def get_path(path_root, data, max_per_class=5000):
    """Recursively load labelled images under ``path_root`` into ``data``.

    The label of a file is the first character of its file name read as a
    digit; it selects the bucket ``data[label]``. Every loaded image is
    flattened into a list of pixel values and the label is appended as the
    last element of that list.

    Args:
        path_root: Directory to scan. Sub-directories are scanned recursively.
        data: List of per-label buckets (lists). It is mutated in place.
        max_per_class: Maximum number of samples kept per bucket. Files that
            belong to a bucket that is already full are skipped without
            being opened.
    """
    for entry in os.listdir(path_root):
        full_path = os.path.join(path_root, entry)
        if os.path.isdir(full_path):
            get_path(full_path, data, max_per_class)
            continue

        # Files whose name does not start with a digit (hidden files, notes,
        # ...) carry no label; ignore them instead of crashing on int().
        try:
            label = int(entry[0:1])
        except ValueError:
            continue
        # A digit with no matching bucket cannot be stored anywhere.
        if label >= len(data):
            continue

        bucket = data[label]
        if len(bucket) >= max_per_class:
            continue

        # The context manager closes the underlying file handle, which
        # matters when thousands of images are read in one run.
        with Image.open(full_path) as img:
            data_vector = list(np.array(img).flatten())
        data_vector.append(label)
        bucket.append(data_vector)
def data_transformer(pic_root='./pic', num_classes=3, train_per_class=3000):
    """Load the image set, dump it to ``data.txt`` and split it into train/test.

    Images are collected with ``get_path`` into one bucket per label
    (labels ``0 .. num_classes - 1``). Every sample row holds the flattened
    pixels followed by the label as the last column.

    Args:
        pic_root: Root directory of the image files.
        num_classes: Number of label buckets to collect.
        train_per_class: Number of leading samples of every class that go
            into the training set; the remaining samples form the test set.

    Returns:
        A tuple ``(data_train, data_test)`` of 3-D arrays indexed as
        ``[label][sample][column]``.

    Raises:
        ValueError: If at least one class has no image at all.
    """
    # One bucket per label: 0, 1, 2 with the default settings.
    buckets = [[] for _ in range(num_classes)]
    get_path(pic_root, buckets)

    per_class = min(len(bucket) for bucket in buckets)
    if per_class == 0:
        raise ValueError(
            'no images found for at least one class under %r' % (pic_root,))

    # np.array needs a rectangular input, so every class is cut down to the
    # size of the smallest one before stacking.
    data = np.array([bucket[:per_class] for bucket in buckets])
    data_train = data[:, :train_per_class, :]
    data_test = data[:, train_per_class:, :]
    print(data.shape)

    # Flatten to one row per sample; the row width is taken from the data
    # rather than assuming 28 * 28 pixels + 1 label.
    data = data.reshape([data.shape[0] * data.shape[1], data.shape[2]])
    print(data.shape)
    np.savetxt('data.txt', data, fmt='%d')
    return data_train, data_test
class NaiveBayesClassifier:
    """Naive Bayes classifier over binarised 28x28 images.

    Every pixel is reduced to "on" (value > 0) or "off" (anything else).
    ``data_train[label]`` / ``data_test[label]`` must be sequences of flat
    images that belong to class ``label``; only the first 28 * 28 entries of
    an image are read, so a trailing label column is ignored.

    Two ways of classifying are offered:

    * table lookup: ``table_train`` builds the probability table once and
      ``table_classify`` / ``table_test`` only look values up;
    * lazy learning: construction only records pixel counts and
      ``lazy_classify`` / ``lazy_test`` derive the probabilities from those
      counts on every call.

    No class prior is applied, i.e. all classes are treated as equally
    likely.
    """

    # Number of pixel features read from every image.
    _N_PIXELS = 28 * 28

    # Training set, indexed [label][sample][column].
    np_data_train = None
    # Test set, indexed [label][sample][column].
    np_data_test = None
    # Table-lookup result: probabilities indexed [pixel][off/on][label].
    np_training_data = None
    # Lazy-learning record: pixel counts indexed [label][pixel][off/on].
    np_data_record = None

    def __init__(self, data_train, data_test):
        """Store both data sets and record the lazy-learning pixel counts."""
        self.np_data_test = data_test
        self.np_data_train = data_train
        self.lazy_record()

    def _count_pixels(self):
        """Count off/on occurrences of every pixel for every class.

        Returns:
            Integer array indexed ``[label][pixel][0 = off, 1 = on]``.
        """
        n_pixels = self._N_PIXELS
        counts = np.zeros((len(self.np_data_train), n_pixels, 2), dtype=np.int64)
        for label, images in enumerate(self.np_data_train):
            n_images = len(images)
            if n_images == 0:
                continue
            on_count = (np.asarray(images)[:, :n_pixels] > 0).sum(axis=0)
            counts[label, :, 1] = on_count
            counts[label, :, 0] = n_images - on_count
        return counts

    @staticmethod
    def _probability_table(counts):
        """Turn pixel counts into Laplace-smoothed probabilities.

        Args:
            counts: Array indexed ``[label][pixel][off/on]``.

        Returns:
            Float array indexed ``[pixel][off/on][label]``.
        """
        # Off-count + on-count of any pixel equals the number of images of
        # that class, so the class sizes can be recovered from the counts.
        class_sizes = counts[:, 0, :].sum(axis=1)
        # Add-one smoothing over the two possible pixel states: +1 in the
        # numerator, +2 in the denominator, so off + on always sums to 1.
        smoothed = (counts + 1.0) / (class_sizes[:, np.newaxis, np.newaxis] + 2.0)
        return np.transpose(smoothed, (1, 2, 0))

    @classmethod
    def _classify(cls, table, wait_test_img):
        """Return the label with the highest summed log-probability."""
        n_pixels = cls._N_PIXELS
        pixel_state = (np.asarray(wait_test_img)[:n_pixels] > 0).astype(np.intp)
        # For every pixel pick the row matching its observed state; the
        # result is indexed [pixel][label].
        picked = table[np.arange(n_pixels), pixel_state, :]
        # Sum of logs instead of a product of 784 small numbers.
        scores = np.log(picked).sum(axis=0)
        # argmax returns the first maximum, i.e. ties go to the lowest label.
        return int(np.argmax(scores))

    def lazy_record(self):
        """Record the pixel counts of the training set without evaluating them."""
        self.np_data_record = self._count_pixels()

    def table_train(self):
        """Build the probability lookup table from the whole training set."""
        self.np_training_data = self._probability_table(self._count_pixels())

    def table_classify(self, wait_test_img):
        """Classify one image with the table built by ``table_train``.

        Args:
            wait_test_img: Flat image; the first 28 * 28 entries are read.

        Returns:
            The predicted label as an ``int``.

        Raises:
            RuntimeError: If ``table_train`` has not been called yet.
        """
        if self.np_training_data is None:
            raise RuntimeError('table_train() must be called before table_classify()')
        return self._classify(np.asarray(self.np_training_data), wait_test_img)

    def lazy_classify(self, wait_test_img):
        """Classify one image, deriving the probabilities from the recorded counts.

        Args:
            wait_test_img: Flat image; the first 28 * 28 entries are read.

        Returns:
            The predicted label as an ``int``.
        """
        table = self._probability_table(np.asarray(self.np_data_record))
        return self._classify(table, wait_test_img)

    def table_test(self):
        """Classify the whole test set by table lookup.

        Returns:
            Nested list ``result[actual_label][sample]`` of predicted labels.
        """
        return [[self.table_classify(img) for img in images]
                for images in self.np_data_test]

    def lazy_test(self):
        """Classify the whole test set by lazy learning.

        Returns:
            Nested list ``result[actual_label][sample]`` of predicted labels.
        """
        return [[self.lazy_classify(img) for img in images]
                for images in self.np_data_test]
if __name__ == '__main__':
    # Load the images and build the classifier. `data_train` / `data_test`
    # are deliberately bound at module level; check for code that reads
    # these names as globals before moving them into a function.
    data_train, data_test = data_transformer()
    my_nbc = NaiveBayesClassifier(data_train, data_test)

    # Table lookup: time training plus classification of the test set.
    table_start = datetime.datetime.now()
    my_nbc.table_train()
    table_rs = my_nbc.table_test()
    table_elapsed = datetime.datetime.now() - table_start
    print('查表计算时间:', table_elapsed.total_seconds())
    # Per actual label, count the predictions that match it.
    for actual, predicted in enumerate(table_rs):
        print('查表实际%d测试结果正确' % actual,
              np.count_nonzero(np.array(predicted) == actual))

    print('--------------------------------------')

    # Lazy learning: time classification of the test set only.
    lazy_start = datetime.datetime.now()
    lazy_rs = my_nbc.lazy_test()
    lazy_elapsed = datetime.datetime.now() - lazy_start
    print('懒惰学习计算时间:', lazy_elapsed.total_seconds())
    # Per actual label, count the predictions that match it.
    for actual, predicted in enumerate(lazy_rs):
        print('懒惰学习实际%d测试结果正确' % actual,
              np.count_nonzero(np.array(predicted) == actual))
原文:https://www.blog.hiyj.cn/article/detail/103
图片资源:https://www.blog.hiyj.cn/article/detail/4