Machine Learning: Document Filtering

Classification

Classification methods:
- Naive Bayes classifier
- Fisher classifier
import re
import math

# Tokenizer
def getwords(doc):
    # Split on runs of non-alphanumeric characters
    splitter = re.compile('\\W+')
    # Lowercase each word and keep only words of 3 to 19 characters
    words = [s.lower() for s in splitter.split(doc) if len(s) > 2 and len(s) < 20]
    # Every unique word becomes a feature
    return dict([(word, 1) for word in words])
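A quick check of the tokenizer (a minimal sketch; the sample sentence is arbitrary):

print getwords('Nobody owns the water.')
# {'nobody': 1, 'owns': 1, 'the': 1, 'water': 1} (key order may vary)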
class classifier:
    def __init__(self, getfeatures, filename=None):
        # Counts of feature/category combinations
        self.featurecatacount = {}
        # Number of documents in each category
        self.catagorycount = {}
        self.getfeatures = getfeatures

    # Increase the count for a feature/category pair
    def incfeaturecount(self, feature, catagory):
        self.featurecatacount.setdefault(feature, {})
        self.featurecatacount[feature].setdefault(catagory, 0)
        self.featurecatacount[feature][catagory] += 1

    # Increase the document count for a category
    def inccatacount(self, catagory):
        self.catagorycount.setdefault(catagory, 0)
        self.catagorycount[catagory] += 1
    # Number of times a feature has appeared in a category
    def featurecount(self, feature, catagory):
        if feature in self.featurecatacount and catagory in self.featurecatacount[feature]:
            return float(self.featurecatacount[feature][catagory])
        return 0.0

    # Number of items in a category
    def catacount(self, catagory):
        if catagory in self.catagorycount:
            return float(self.catagorycount[catagory])
        return 0.0

    # Total number of items
    def totalcount(self):
        return sum(self.catagorycount.values())

    # List of all categories
    def catagories(self):
        return self.catagorycount.keys()
    # Train the classifier: extract the features of an item and
    # increment the counts for the given category
    def train(self, item, catagory):
        features = self.getfeatures(item)
        # Increment the count of every feature under this category
        for f in features:
            self.incfeaturecount(f, catagory)
        # Increment the document count for the category
        self.inccatacount(catagory)

    # Probability that a word appears in documents of a category
    def featureprob(self, feature, catagory):
        count = self.catacount(catagory)
        if count == 0:
            return 0
        # Times the feature appeared in the category, divided by the
        # number of documents in the category
        return self.featurecount(feature, catagory) / count
    def weightedprob(self, feature, catagory, prf, weight=1, ap=0.5):
        """
        Weighted average of the observed probability and an assumed prior.
        Args:
            feature: the feature
            catagory: the category
            prf: function used to compute the basic probability
            weight: weight given to the assumed probability
            ap: assumed probability to start from
        """
        # Current probability
        basicprob = prf(feature, catagory)
        # Number of times the feature has appeared across all categories
        totals = sum([self.featurecount(feature, c) for c in self.catagories()])
        # Weighted average of the assumed and the observed probability
        bp = (weight * ap + totals * basicprob) / (weight + totals)
        return bp
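To make the formula concrete (illustrative numbers only): with the defaults weight=1 and ap=0.5, a feature seen 5 times in total with a basic probability of 0.75 gets

(1 * 0.5 + 5 * 0.75) / (1 + 5) = 4.25 / 6 ≈ 0.7083

so rarely seen features stay near the 0.5 prior, while frequently seen ones converge to their observed probability.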
class naivebayes(classifier):
    """
    Naive Bayes classifier: assumes the combined probabilities are
    independent of one another, so the overall probability is the
    product of the individual probabilities.
    """
    def __init__(self, getfeatures):
        classifier.__init__(self, getfeatures)
        self.thresholds = {}

    def setthreshold(self, cat, threshold):
        self.thresholds[cat] = threshold

    def getthreshold(self, cat):
        if cat not in self.thresholds:
            return 1.0
        return self.thresholds[cat]

    # Probability of an entire document given a category
    def docprob(self, item, catagory):
        features = self.getfeatures(item)
        # Multiply the probabilities of all the features together
        p = 1
        for f in features:
            p *= self.weightedprob(f, catagory, self.featureprob)
        return p
    def prob(self, item, catagory):
        """
        Bayes' theorem turns the conditional around: P(A|B) = P(B|A) * P(A) / P(B).
        Here A is the category and B is the document; the divisor P(B) is
        the same for every category, so it can be dropped when the scores
        are only compared against each other.
        """
        catprob = self.catacount(catagory) / self.totalcount()
        docprob = self.docprob(item, catagory)
        return catprob * docprob

    def classify(self, item, default=None):
        probs = {}
        # Find the category with the highest probability
        best = default
        maxprob = 0.0
        for cat in self.catagories():
            probs[cat] = self.prob(item, cat)
            if probs[cat] > maxprob:
                maxprob = probs[cat]
                best = cat
        # No category had a nonzero probability
        if best not in probs:
            return default
        # Make sure the winner beats every other category by more than
        # its threshold
        for cat in probs:
            if cat == best:
                continue
            if probs[cat] * self.getthreshold(best) > probs[best]:
                return default
        return best
class fisherclassifier(classifier):
    """
    Fisher classifier
    """
    def __init__(self, getfeatures):
        classifier.__init__(self, getfeatures)
        self.minimums = {}

    def setminimums(self, cat, minimums):
        self.minimums[cat] = minimums

    def getminimums(self, cat):
        if cat not in self.minimums:
            return 0
        return self.minimums[cat]

    def cprob(self, f, cat):
        # Frequency of the feature in this category
        clf = self.featureprob(f, cat)
        if clf == 0:
            return 0
        # Frequency of the feature across all categories
        freqsum = sum([self.featureprob(f, c) for c in self.catagories()])
        # The probability is the frequency in this category divided by
        # the overall frequency
        p = clf / freqsum
        return p
    def fisherprob(self, item, cat):
        # Multiply all the probabilities together
        p = 1
        features = self.getfeatures(item)
        for f in features:
            p *= self.weightedprob(f, cat, self.cprob)
        # Take the natural log and multiply by -2
        fscore = -2 * math.log(p)
        # Feed the score to the inverse chi-squared function to get a probability
        return self.invchi2(fscore, len(features) * 2)

    # Inverse chi-squared function (valid for an even number of degrees of freedom)
    def invchi2(self, chi, df):
        m = chi / 2.0
        summ = term = math.exp(-m)
        for i in range(1, df // 2):
            term *= m / i
            summ += term
        return min(summ, 1.0)
    def classify(self, item, default=None):
        # Loop over the categories looking for the best result
        best = default
        maxprob = 0.0
        for c in self.catagories():
            p = self.fisherprob(item, c)
            # Make sure it beats the minimum for that category
            if p > self.getminimums(c) and p > maxprob:
                best = c
                maxprob = p
        return best
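invchi2 evaluates the upper-tail probability of a chi-squared distribution with an even number of degrees of freedom via its closed-form series. A quick sanity check (a sketch that assumes SciPy is installed; nothing else in this post needs it):

from scipy.stats import chi2
fc = fisherclassifier(getwords)
print fc.invchi2(4.0, 4)   # series expansion, ≈ 0.4060
print chi2.sf(4.0, 4)      # SciPy's survival function agrees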
def sampletrain(cl):
    cl.train('Nobody owns the water.', 'good')
    cl.train('the quick rabbit jumps fences', 'good')
    cl.train('buy pharmaceuticals now', 'bad')
    cl.train('make quick money at the online casino', 'bad')
    cl.train('the quick brown fox jumps', 'good')
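For reference when checking the numbers below, getwords reduces these five documents to the following feature sets (words shorter than three characters, such as 'at', are dropped):

# good: nobody, owns, the, water
# good: the, quick, rabbit, jumps, fences
# bad:  buy, pharmaceuticals, now
# bad:  make, quick, money, the, online, casino
# good: the, quick, brown, fox, jumps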
cl = classifier(getwords)
cl.train('the quick brown fox jumps over the lazy dog', 'good')
cl.train('make quick money in the online cassino', 'bad')
print cl.featurecount('quick', 'good')
print cl.featurecount('quick', 'bad')
sampletrain(cl)
print cl.featureprob('quick', 'good')
print cl.weightedprob('quick', 'good', cl.featureprob)
1.0
1.0
0.75
0.708333333333
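These values can be checked by hand: after the two initial documents plus one round of sampletrain there are 4 good documents, 3 of which contain 'quick', so featureprob('quick', 'good') = 3/4 = 0.75. 'quick' also appears in 2 bad documents, so weightedprob folds the 0.5 prior into (1 * 0.5 + 5 * 0.75) / (1 + 5) ≈ 0.7083.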
# Naive Bayes test
bayes = naivebayes(getwords)
sampletrain(bayes)
print bayes.prob('quick rabbit', 'good')
print bayes.prob('quick rabbit', 'bad')
print bayes.classify('quick rabbit', 'unknown')
print bayes.classify('quick money', 'unknown')
# Require 'bad' to beat 'good' by more than a factor of 3
bayes.setthreshold('bad', 3.0)
print bayes.classify('quick money', 'unknown')
# More training data shifts the balance back towards 'bad'
for i in range(10):
    sampletrain(bayes)
print bayes.classify('quick money', 'unknown')
0.15625
0.05
good
bad
unknown
bad
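The first score can likewise be verified by hand: P(good) = 3/5, weightedprob('quick', 'good') = (0.5 + 3 * 2/3) / 4 = 0.625, and weightedprob('rabbit', 'good') = (0.5 + 1/3) / 2 ≈ 0.4167, so prob returns 0.6 * 0.625 * 0.4167 ≈ 0.15625.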
# Fisher classifier test
fisher = fisherclassifier(getwords)
sampletrain(fisher)
print fisher.cprob('quick', 'good')
print fisher.fisherprob('quick rabbit', 'good')
print fisher.fisherprob('quick rabbit', 'bad')
print fisher.classify('quick rabbit')
print fisher.classify('quick money')
# Refuse to classify as 'bad' unless fisherprob exceeds 0.8
fisher.setminimums('bad', 0.8)
print fisher.classify('quick money')
# A lower minimum lets 'bad' win again
fisher.setminimums('bad', 0.4)
print fisher.classify('quick money')
0.571428571429
0.78013986589
0.356335962833
good
bad
good
bad
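The cprob value at the top follows directly from the counts: featureprob('quick', 'good') = 2/3 and featureprob('quick', 'bad') = 1/2, so cprob('quick', 'good') = (2/3) / (2/3 + 1/2) = 4/7 ≈ 0.5714. Unlike the raw naive Bayes score, fisherprob is a number between 0 and 1 that behaves like a calibrated probability, which is why fixed minimums such as 0.8 are meaningful here.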