数据挖掘本科作业：朴素贝叶斯好评预测

数据集来源：教师

数据集已划分为训练集和测试集，并已完成分词（语义分割）

以下是代码

#python3
#hollytan@126.com

def loadTest(testPath):
    """Load the test set into the module-level ``test`` list.

    Every line of the UTF-8 file at *testPath* is appended unchanged,
    trailing newline included.

    Args:
        testPath: Path of the test-set file.
    """
    with open(testPath, encoding="UTF-8") as fp:
        # A file object yields its lines lazily, so readlines() is not needed.
        test.extend(fp)

def loadFile(modelPath):
    """Load the model file into the module-level ``dataset`` list.

    The model file holds attribute/class counts, one record per line.
    Every line of the UTF-8 file at *modelPath* is appended unchanged,
    trailing newline included.

    Args:
        modelPath: Path of the model (count) file.
    """
    with open(modelPath, encoding="UTF-8") as fp:
        # A file object yields its lines lazily, so readlines() is not needed.
        dataset.extend(fp)

def lineSplit(line,pro,classNumList):
    """Add one model-file record to the conditional-probability table.

    The previous version split the line on a space and then split the first
    piece on a space again, which can never yield more than one element, so
    every call failed with IndexError.

    NOTE(review): the record layout is reconstructed from the indices the
    old code read -- ``<field0> <class> <attribute> ... <count>`` separated
    by any whitespace, with the count as the LAST field. Confirm this against
    the real model file.

    Args:
        line: One line of the model file. Blank lines are ignored.
        pro: Table updated in place; ``pro[attribute][class]`` is set to
            ``count / classNumList[i]``.
        classNumList: Denominators; index 1 is used for ``c1`` and index 2
            for ``c2``.

    Raises:
        ValueError: If a non-blank line has fewer than four fields or its
            count is not an integer.
    """
    fields = line.split()
    if not fields:
        return
    if len(fields) < 4:
        raise ValueError("malformed model line: %r" % line)

    num = int(fields[-1])
    c = fields[1]
    m = fields[2]

    # The attribute entry is created even for an unknown class, as before.
    probs = pro.setdefault(m, {})
    if c == c1:
        probs[c1] = num / classNumList[1]
    elif c == c2:
        probs[c2] = num / classNumList[2]
    else:
        print("error to classF")

def classfier(test,pro):
    """Classify every test sample and print accuracy plus a 2x2 tally.

    Each sample is cleaned with ``replaceNoChinese`` and split on whitespace.
    The first token is the true label and the remaining tokens are the
    attributes. For every attribute present in *pro*, the running score of
    each class is multiplied by ``pro[attribute][class]``; the class with the
    larger score is the prediction.

    Printed output, in order:
        * accuracy over the classified samples (``0.0`` when none were
          classified, where the old code raised ZeroDivisionError);
        * ``<correct>|<wrong>`` for samples predicted as ``c1``;
        * a separator line;
        * ``<correct>|<wrong>`` for samples predicted as ``c2``.

    Args:
        test: Iterable of raw sample lines.
        pro: Mapping ``attribute -> {c1: p, c2: p}``.

    Returns:
        The cleaned lines of samples whose two class scores were equal and
        therefore could not be classified.
    """
    # Samples that could not be assigned to either class.
    notClassfSample_list = []
    # Overall right / wrong counters.
    t = f = 0
    # Tally: ht/hf = predicted c1 (right/wrong), ct/cf = predicted c2.
    ht = hf = ct = cf = 0

    for sample in test:
        line = replaceNoChinese(sample)
        tokens = line.split()
        if not tokens:
            # No label at all (e.g. a blank line): nothing to score.
            continue
        C = tokens[0]

        aP = 1
        bP = 1
        for m in tokens[1:]:
            if m in pro:
                aP *= pro[m][c1]
                bP *= pro[m][c2]

        if aP > bP:
            if c1 == C:
                t += 1
                ht += 1
            else:
                f += 1
                hf += 1
        elif aP < bP:
            if c2 == C:
                t += 1
                ct += 1
            else:
                f += 1
                cf += 1
        else:
            notClassfSample_list.append(line)

    classified = t + f
    print(t / classified if classified else 0.0)
    print(f"{ht}|{hf}")
    print("-------")
    print(f"{ct}|{cf}")

    return notClassfSample_list

def dataPro2(pro,a):
    """Fill in missing class entries and normalise complete ones, in place.

    For each attribute's ``{class: value}`` dict in *pro*:

    * With fewer than two entries, the class that is absent gets *a* and the
      other class is set to ``1``. ``c1`` is checked first, so an empty dict
      becomes ``{c1: a, c2: 1}``.
    * Otherwise every value is divided by the sum of the dict's values.

    Args:
        pro: Mapping ``attribute -> {class: value}``; modified in place.
        a: Value assigned to the missing class.
    """
    for probs in pro.values():
        if len(probs) >= 2:
            # Both classes present: rescale so the values sum to one.
            total = 0
            for p in probs.values():
                total += p
            for label in probs:
                probs[label] = probs[label] / total
        elif c1 not in probs:
            probs[c1] = a
            probs[c2] = 1
        elif c2 not in probs:
            probs[c1] = 1
            probs[c2] = a

def replaceNoChinese(text):
    """Return *text* with each character outside U+4E00-U+9FA5 replaced by a space."""
    return re.sub(r'[^\u4e00-\u9fa5]', ' ', text)
#main
import os
import re
# Class labels and the module-level containers the loader functions fill.
c1 = "好评"
c2 = "差评"
dataset = []
test = []
pro1 = {}
notlist1 = []

if __name__ == "__main__":
    import sys

    # The posted script called loadFile()/loadTest() without their required
    # path argument and an undefined pro(); the paths now come from the
    # command line and the probability table is built explicitly.
    if len(sys.argv) != 3:
        sys.exit("usage: python " + sys.argv[0] + " MODEL_FILE TEST_FILE")

    loadFile(sys.argv[1])

    # NOTE(review): the post does not show how the per-class denominators
    # were obtained. They are reconstructed here as the sum of the counts of
    # each class, assuming records look like
    # "<field0> <class> <attribute> <count>" -- confirm with the real data.
    # Index 0 holds the grand total; lineSplit reads indices 1 and 2.
    classNumList = [0, 0, 0]
    for record in dataset:
        fields = record.split()
        if len(fields) < 4:
            continue
        if fields[1] == c1:
            classNumList[1] += int(fields[-1])
        elif fields[1] == c2:
            classNumList[2] += int(fields[-1])
    classNumList[0] = classNumList[1] + classNumList[2]

    for record in dataset:
        lineSplit(record, pro1, classNumList)

    loadTest(sys.argv[2])

    dataPro2(pro1, 0.001)
    notlist1 = classfier(test, pro1)

结果：

0.8985
807|10
-------
990|193
0

思考：关于数据处理。训练集中某些属性在特定类别下的取值缺失，用传统的删除元组或平均值补全并不合适：对好评一侧的缺失值来说，该属性很可能是差评的绝对性评价（例如"怒气冲天"只出现在差评中）。

因此选用预设参数补全：我的方法是用 a（<1）替代空缺的值。在测试集上，该参数越小，测试效果越好；但这也有过拟合的风险，需要以后仔细研究。

posted on 2021-01-18 14:09 onlyweast 阅读(193) 评论(0) 收藏举报

刷新页面返回顶部

公告