-
博客代码:180927
-
作业代码:181007
-
贝叶斯概率
![]()
朴素贝叶斯概率计算过程
![]()
邮件的区分方法
![]()
处理的流程
ftrain = open("spam_train.txt")
lines = ftrain.readlines()
S = dict() #初始化a,b字典
H = dict()
tagS = dict() #标记位,使得一个邮件里面不同的词只会记录一次
tagH = dict()
trash = 0 #记录垃圾邮箱的个数
for line in lines: #对a,b以及标记位进行初始化,从0开始计数
data = line.split(' ')
count_d = len(data)
if data[0] == '0':
trash+=1
for i in range(count_d-1):
S[data[i+1]] = 0
tagS[data[i+1]] = 0
H[data[i+1]] = 0
tagH[data[i+1]] = 0
for line in lines: #记录关键词垃圾邮件和正常邮件的个数
for i in range(count_d-1):
tagS[data[i+1]] = 0
tagH[data[i+1]] = 0
data = line.split(' ')
count_d = len(data)
if data[0] == '0':
for i in range(count_d-1):
if tagS[data[i+1]] == 0: #如果没被标记过,才会对其进行加数
S[data[i+1]]+=1
tagS[data[i+1]] = 1
if data[0] == '1':
for i in range(count_d-1):
if tagH[data[i+1]] == 0:
H[data[i+1]]+=1
tagH[data[i+1]] = 1
rig = 0
ftest = open("spam_test.txt")
lines = ftest.readlines()
for line in lines:
ans = trash/(5000-trash)
data = line.split(' ')
count_d = len(data)
for i in range(count_d - 1): #将没有出现在训练邮件的单词置0
S.setdefault(data[i+1],0)
for i in range(count_d - 1):
H.setdefault(data[i+1],0)
for i in range(count_d - 1): #将两个结果进行相除比较大小
ans = ans*(S.get(data[i+1])+1)/(trash + 1)
ans = ans*(5001 - trash)/(H.get(data[i+1])+1)
if ans>1:
if data[0] == '0':
rig+=1
if ans<1:
if data[0] == '1':
rig+=1
print(rig/1000)