Loading

Python实现朴素贝叶斯识别MNIST手写图片

目前只实现了数字 0、1、2 三个类别的识别。

import datetime
import math

import numpy as np
import os
from PIL import Image


def get_path(path_root, data, max_per_class=5000):
    """Recursively load labelled images found under ``path_root`` into ``data``.

    The label of an image is the first character of its file name read as
    an integer; it selects the bucket ``data[label]``. Every stored sample
    is the flattened pixel array of the image with the label appended as
    the last element.

    Args:
        path_root: Directory to scan. Sub-directories are scanned
            recursively.
        data: List of per-class lists, indexed by label. Mutated in place.
        max_per_class: Maximum number of samples kept per class. Images
            belonging to a class that is already full are skipped without
            being opened.

    Files whose name does not start with a digit, or whose digit has no
    bucket in ``data``, are ignored instead of aborting the whole scan.
    """
    for entry in os.listdir(path_root):
        full_path = os.path.join(path_root, entry)
        if os.path.isdir(full_path):
            get_path(full_path, data, max_per_class)
            continue

        # The label is encoded as the leading character of the file name.
        try:
            label = int(entry[0:1])
        except ValueError:
            # Not a labelled image (hidden file, readme, ...): skip it.
            continue
        if label >= len(data):
            # No bucket was requested for this class.
            continue
        if len(data[label]) >= max_per_class:
            continue

        # The context manager releases the file handle deterministically.
        with Image.open(full_path) as img:
            data_vector = list(np.array(img).flatten())
        data_vector.append(label)
        data[label].append(data_vector)


def data_transformer():
    """Load the digit images under ``./pic`` and split them into train/test.

    Images are collected per class by ``get_path`` (one bucket for each of
    the digits 0, 1 and 2). The first 3000 samples of every class form the
    training set and the remaining ones the test set. The complete data
    set is also flattened to two dimensions and written to ``data.txt``.

    Returns:
        A ``(data_train, data_test)`` tuple of arrays indexed
        ``[class][sample][feature]``.

    Raises:
        ValueError: If at least one class has no images at all.
    """
    # One bucket per digit class: 0, 1, 2
    buckets = [[], [], []]
    get_path('./pic', buckets)

    # np.array needs a rectangular input, so keep the same number of
    # samples for every class (the size of the smallest bucket).
    per_class = min(len(bucket) for bucket in buckets)
    if per_class == 0:
        raise ValueError("every class needs at least one image under './pic'")
    data = np.array([bucket[:per_class] for bucket in buckets])

    data_train = data[:, 0:3000, :]
    data_test = data[:, 3000:, :]

    print(data.shape)
    # Collapse the class axis; the feature width is taken from the data
    # itself rather than being hard-coded.
    data = data.reshape([data.shape[0] * data.shape[1], data.shape[2]])
    print(data.shape)
    np.savetxt('data.txt', data, fmt='%d')
    return data_train, data_test


class NaiveBayesClassifier:
    """Naive Bayes classifier over binarised 28x28 images.

    Every pixel is reduced to a binary feature (``> 0`` means "on"). Two
    equivalent strategies are offered so their run time can be compared:

    * table lookup: ``table_train`` builds the probability table once and
      ``table_classify`` only reads it;
    * lazy learning: only raw pixel counts are recorded up front and
      ``lazy_classify`` derives the probabilities on every call.

    Training and test data are indexed ``[class][sample][feature]``. Only
    the first ``N_PIXELS`` features of a sample are used, so a trailing
    label column is ignored. No class prior is applied, i.e. all classes
    are treated as equally likely.
    """

    # Number of pixel features per image.
    N_PIXELS = 28 * 28

    # Training set
    np_data_train = None
    # Test set
    np_data_test = None
    # Table-lookup training result: probabilities indexed
    # [pixel][pixel value (0/1)][class]
    np_training_data = None
    # Lazy-learning record: how often each pixel was off/on per class
    # (non-zero pixel counts as 1), shape [classes][28*28][2]
    np_data_record = None

    def __init__(self, data_train, data_test):
        """Store both data sets and record the lazy-learning pixel counts.

        Args:
            data_train: Training samples indexed [class][sample][feature].
            data_test: Test samples indexed [class][sample][feature].
        """
        self.np_data_test = data_test
        self.np_data_train = data_train
        self.np_data_record = np.zeros(
            (len(data_train), self.N_PIXELS, 2), dtype=int)
        # Record the data for lazy learning
        self.lazy_record()

    def _pixel_counts(self):
        """Count, per class and pixel, how many training images have it off/on.

        Returns:
            Integer array of shape [classes][N_PIXELS][2]; index 0 of the
            last axis counts "off" pixels, index 1 counts "on" pixels.
        """
        counts = np.zeros((len(self.np_data_train), self.N_PIXELS, 2), dtype=int)
        for label, samples in enumerate(self.np_data_train):
            samples = np.asarray(samples)
            if len(samples) == 0:
                continue
            lit = samples[:, :self.N_PIXELS] > 0
            on = lit.sum(axis=0)
            counts[label, :, 1] = on
            counts[label, :, 0] = len(samples) - on
        return counts

    @staticmethod
    def _smoothed_table(counts):
        """Turn [class][pixel][value] counts into [pixel][value][class] probabilities.

        Add-one smoothing is applied over the two possible pixel values,
        so the two probabilities of a pixel sum to 1 and are never 0.
        """
        counts = np.asarray(counts)
        # off + on equals the number of samples of the class.
        totals = counts.sum(axis=2, keepdims=True)
        probabilities = (counts + 1) / (totals + 2)
        return np.ascontiguousarray(np.transpose(probabilities, (1, 2, 0)))

    def _best_class(self, table, wait_test_img):
        """Return the class with the highest summed log-probability.

        Args:
            table: Probabilities indexed [pixel][value][class].
            wait_test_img: Feature vector of the image to classify.
        """
        pixel_on = (np.asarray(wait_test_img)[:self.N_PIXELS] > 0).astype(int)
        # Pick, for every pixel, the row matching its observed value.
        chosen = np.asarray(table)[np.arange(self.N_PIXELS), pixel_on]
        # Summing logs avoids the underflow of multiplying 784 probabilities.
        # argmax returns the first maximum, like list.index(max(...)).
        return int(np.argmax(np.log(chosen).sum(axis=0)))

    # Lazy learning only records the incoming data, nothing is computed yet
    def lazy_record(self):
        """Store the per-class pixel counts in ``np_data_record``.

        The counts are rebuilt from scratch, so calling this more than
        once does not accumulate.
        """
        self.np_data_record = self._pixel_counts()

    # Table-lookup training
    def table_train(self):
        """Build the probability table ``np_training_data`` from the training set."""
        self.np_training_data = self._smoothed_table(self._pixel_counts())

    # Table-lookup classification based on the existing training result
    def table_classify(self, wait_test_img):
        """Classify one image with the table built by ``table_train``.

        Returns:
            The index of the most likely class.
        """
        return self._best_class(self.np_training_data, wait_test_img)

    # Lazy learning: estimate from the recorded counts when data arrives
    def lazy_classify(self, wait_test_img):
        """Classify one image, deriving the probabilities from the raw counts.

        Returns:
            The index of the most likely class.
        """
        return self._best_class(self._smoothed_table(self.np_data_record), wait_test_img)

    # Classify the test set with table lookup
    def table_test(self):
        """Return ``result[i][j]``: predicted class of test image j of true class i."""
        return [[self.table_classify(image) for image in images]
                for images in self.np_data_test]

    # Classify the test set with lazy learning
    def lazy_test(self):
        """Return ``result[i][j]``: predicted class of test image j of true class i."""
        return [[self.lazy_classify(image) for image in images]
                for images in self.np_data_test]


if __name__ == '__main__':
    # Load the images and build the classifier; constructing it already
    # records the pixel counts used by the lazy-learning path.
    data_train, data_test = data_transformer()
    classifier = NaiveBayesClassifier(data_train, data_test)

    # Table lookup: time training plus classification of the test set.
    table_begin = datetime.datetime.now()
    classifier.table_train()
    table_predictions = classifier.table_test()
    table_elapsed = datetime.datetime.now() - table_begin
    print('查表计算时间:', table_elapsed.total_seconds())

    # Per true class, report how many test images were classified correctly.
    for digit, predicted in enumerate(table_predictions):
        predicted = np.array(predicted)
        print('查表实际%d测试结果正确' % digit, len(predicted[predicted == digit]))

    print('--------------------------------------')

    # Lazy learning: time classification of the test set only.
    lazy_begin = datetime.datetime.now()
    lazy_predictions = classifier.lazy_test()
    lazy_elapsed = datetime.datetime.now() - lazy_begin
    print('懒惰学习计算时间:', lazy_elapsed.total_seconds())

    for digit, predicted in enumerate(lazy_predictions):
        predicted = np.array(predicted)
        print('懒惰学习实际%d测试结果正确' % digit, len(predicted[predicted == digit]))

原文：https://www.blog.hiyj.cn/article/detail/103
图片资源：https://www.blog.hiyj.cn/article/detail/4

posted @ 2021-06-21 10:30  WindSnowLi  阅读(24)  评论(0)  编辑  收藏  举报