数据挖掘本科作业:朴素贝叶斯好评预测
数据集来源:教师
数据集已划分为训练集和测试集,并已完成分词(语义分割)
以下是代码
#python3
#hollytan@126.com
#hollytan@126.com
def loadTest(testPath):
    """Load the test set into the module-level ``test`` list.

    Every line of the UTF-8 text file at ``testPath`` is appended to ``test``
    unchanged (trailing newline included), in file order.

    Args:
        testPath: Path of the test-set file.
    """
    with open(testPath, encoding="UTF-8") as fp:
        # Iterating the file object yields its lines one by one; the file is
        # read exactly once so every sample is stored a single time.
        test.extend(fp)
def loadFile(modelPath):
    """Load the model file into the module-level ``dataset`` list.

    The model file holds attribute-class counts.  Every line of the UTF-8
    text file at ``modelPath`` is appended to ``dataset`` unchanged (trailing
    newline included), in file order.

    Args:
        modelPath: Path of the model file.
    """
    with open(modelPath, encoding="UTF-8") as fp:
        # Iterating the file object yields its lines one by one; the file is
        # read exactly once so every record is stored a single time.
        dataset.extend(fp)
def lineSplit(line, pro, classNumList):
    """Add one model-file record to the probability table.

    A record is a key followed by an integer count.  The key consists of
    whitespace-separated fields: field 1 is the class label and field 2 is
    the attribute (word); field 0 is not used by this function.

    The value stored is ``count / classNumList[1]`` when the label is ``c1``
    and ``count / classNumList[2]`` when it is ``c2``.  For any other label
    the message ``error to classF`` is printed.

    Args:
        line: One raw line of the model file.  Blank lines are ignored.
        pro: Table ``{word: {class_label: value}}``; updated in place.
        classNumList: Indexable of divisors; items 1 and 2 are used for
            ``c1`` and ``c2`` respectively (presumably the per-class totals
            -- confirm against the code that builds this list).

    Raises:
        ValueError: If the line has no separate count field or the count is
            not an integer.
        IndexError: If the key has fewer than three fields.
    """
    if not line.strip():
        return

    # Split the count off the right-hand end on *any* whitespace, then split
    # the key the same way.  Splitting both levels on a single space can never
    # produce a key with three fields, and the exact delimiter of the model
    # file (tab or space) is not visible here, so accept either.
    key, count_text = line.rsplit(None, 1)
    count = int(count_text)
    fields = key.split()
    label = fields[1]
    word = fields[2]

    # The word's entry is created before the label is checked, so a record
    # with an unknown label still leaves an (empty) entry behind.
    entry = pro.setdefault(word, {})
    if label == c1:
        entry[c1] = float(count / classNumList[1])
    elif label == c2:
        entry[c2] = float(count / classNumList[2])
    else:
        print("error to classF")
def classfier(test, pro):
    """Classify every test sample and print an evaluation summary.

    Each sample is cleaned with ``replaceNoChinese`` and split on spaces; the
    first token is the sample's true label and the remaining tokens are its
    words.  For every word found in ``pro`` the score of class ``c1`` and the
    score of class ``c2`` are multiplied by that word's table values.  The
    class with the larger score is the prediction; equal scores leave the
    sample unclassified.

    Printed, in order:
        1. accuracy over the classified samples (``0.0`` if there are none),
        2. ``<c1 predicted, correct>|<c1 predicted, wrong>``,
        3. ``-------``,
        4. ``<c2 predicted, correct>|<c2 predicted, wrong>``.

    Args:
        test: Iterable of raw sample lines.
        pro: Table ``{word: {class_label: value}}``; every entry that is
            looked up must contain both ``c1`` and ``c2``.

    Returns:
        List of the cleaned lines of all samples whose two scores were equal.
    """
    # Samples that could not be assigned to either class.
    notClassfSample_list = []
    # Correct / wrong prediction counters.
    t = f = 0
    # Contingency table: h* = predicted c1, c* = predicted c2;
    # *t = prediction correct, *f = prediction wrong.
    ht = hf = ct = cf = 0

    for sample in test:
        line = replaceNoChinese(sample)
        tokens = [x for x in line.split(' ') if x != '']
        if not tokens:
            # Nothing is left after cleaning, so there is no label to
            # compare against; skip instead of failing on tokens[0].
            continue
        label = tokens[0]
        words = tokens[1:]

        # NOTE: plain products of many small values can underflow to 0.0 for
        # very long samples; such a sample ends up in the unclassified list.
        aP = 1
        bP = 1
        for m in words:
            if m in pro:
                aP *= pro[m][c1]
                bP *= pro[m][c2]

        if aP > bP:
            if c1 == label:
                t += 1
                ht += 1
            else:
                f += 1
                hf += 1
        elif aP < bP:
            if c2 == label:
                t += 1
                ct += 1
            else:
                f += 1
                cf += 1
        else:
            notClassfSample_list.append(line)

    classified = t + f
    # Guard the division: with no classified sample the accuracy is reported
    # as 0.0 rather than raising ZeroDivisionError.
    print(t / classified if classified else 0.0)
    print(str(ht) + "|" + str(hf))
    print("-------")
    print(str(ct) + "|" + str(cf))
    return notClassfSample_list
def dataPro2(pro, a):
    """Fill in missing class values and normalise complete entries in place.

    For every word entry in ``pro``:

    * fewer than two classes present -- the missing class gets ``a`` and the
      other class is set to ``1`` (overwriting its previous value);
    * otherwise -- the values are divided by their sum so that they add up
      to 1.  Entries whose values sum to zero are left untouched.

    The table is processed exactly once: a second pass would rescale the
    filled-in ``a`` / ``1`` values.

    Args:
        pro: Table ``{word: {class_label: value}}``; modified in place.
        a: Replacement for a missing class value.
    """
    for d1 in pro.values():
        if len(d1) < 2:
            if c1 not in d1:
                d1[c1] = a
                d1[c2] = 1
            elif c2 not in d1:
                d1[c1] = 1
                d1[c2] = a
        else:
            total = sum(d1.values())
            if total == 0:
                # Nothing to normalise; avoid ZeroDivisionError.
                continue
            # Only existing keys are reassigned, which is safe while
            # iterating over the dict.
            for label in d1:
                d1[label] = d1[label] / total
def replaceNoChinese(text):
    """Return ``text`` with every non-Chinese character replaced by a space.

    A character counts as Chinese when it lies in the range U+4E00..U+9FA5.
    Each other character (letters, digits, punctuation, whitespace, newlines)
    becomes exactly one space, so the length of the string is preserved.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    return re.sub(r'[^\u4e00-\u9fa5]', ' ', text)
#main
import os
import re

# Class labels as they appear in the data files:
# c1 = positive review, c2 = negative review.
c1 = "好评"
c2 = "差评"
dataset = []    # raw model-file lines, filled by loadFile
test = []       # raw test-set lines, filled by loadTest
pro1 = {}       # table {word: {class_label: value}}
notlist1 = []   # samples that classfier could not assign to a class

# Each pipeline step runs exactly once; running the load or evaluation steps
# twice would double the stored samples and repeat the printed report.
# NOTE(review): loadFile and loadTest each require a path argument; the paths
# are not present in this listing and must be supplied before this can run.
loadFile()
# NOTE(review): `pro` is not defined anywhere in the visible source.
# Presumably it builds pro1 from `dataset` via lineSplit -- confirm.
pro(pro1)
loadTest()
# Fill missing class values with 0.001 and normalise complete entries.
dataPro2(pro1,0.001)
notlist1=classfier(test,pro1)
结果:
0.8985
807|10
-------
990|193
0
思考:关于数据处理。训练集中某些属性(词)在某一类别下的取值缺失,用传统的删除元组或平均值补全并不合适:某个词在好评中的取值缺失,很可能恰恰说明它是差评的绝对性评价词(例如“怒气冲天”只出现在差评中)。
因此选用预设参数补全:我的方法是用 a(a<1)替代缺失的值。在测试集上,该参数越小,测试效果越好;但这也有过拟合的风险,需要以后仔细研究。