Machine Learning: Document Filtering

Classification

Two classification methods are covered:

  1. Naive Bayes classifier
  2. Fisher classifier

import re
import math

# Tokenizer
def getwords(doc):
    splitter = re.compile(r'\W+')
    # Split on non-alphanumeric characters, lowercase everything, and keep
    # only words between 3 and 19 characters long
    words = [s.lower() for s in splitter.split(doc) if len(s) > 2 and len(s) < 20]

    # Return the set of unique words, each mapped to 1
    return dict([(word, 1) for word in words])
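
A quick sanity check of the tokenizer (the sentence here is made up, not part of the training data used later):

# Tokenizer sanity check: short words ('is', 'A') are dropped, case is folded
print(getwords('the Quick Brown fox is A genius'))
# {'the': 1, 'quick': 1, 'brown': 1, 'fox': 1, 'genius': 1}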

class classifier:
    def __init__(self, getfeatures, filename=None):
        # Counts of feature/category combinations
        self.featurecatacount = {}
        # Number of documents in each category
        self.catagorycount = {}
        self.getfeatures = getfeatures
    
    # Increment the count of a feature/category pair
    def incfeaturecount(self, feature, catagory):
        self.featurecatacount.setdefault(feature, {})
        self.featurecatacount[feature].setdefault(catagory, 0)
        self.featurecatacount[feature][catagory] += 1
    
    # Increment the document count of a category
    def inccatacount(self, catagory):
        self.catagorycount.setdefault(catagory, 0)
        self.catagorycount[catagory] += 1
    
    # Number of times a feature has appeared in a category
    def featurecount(self, feature, catagory):
        if feature in self.featurecatacount and catagory in self.featurecatacount[feature]:
            return float(self.featurecatacount[feature][catagory])
        return 0.0
    
    # Number of items in a category
    def catacount(self, catagory):
        if catagory in self.catagorycount:
            return float(self.catagorycount[catagory])
        return 0.0
    
    # Total number of items
    def totalcount(self):
        return sum(self.catagorycount.values())
    
    # List of all categories
    def catagories(self):
        return self.catagorycount.keys()
    
    # Train the classifier on an item known to belong to a category
    def train(self, item, catagory):
        features = self.getfeatures(item)
        # Increment the count for every feature with this category
        for f in features:
            self.incfeaturecount(f, catagory)
        
        # Increment the document count for this category
        self.inccatacount(catagory)
    
    # Probability that a word appears in a category: Pr(feature | category)
    def featureprob(self, feature, catagory):
        count = self.catacount(catagory)
        if count == 0:
            return 0
        
        # Number of times the feature appeared in this category, divided by
        # the total number of items in the category
        return self.featurecount(feature, catagory) / count
    
    def weightedprob(self, feature, catagory, prf, weight=1, ap=0.5):
        """
        Weighted average of the observed probability and an assumed probability.

        Args:
            feature: the feature
            catagory: the category
            prf: probability function to use (e.g. featureprob)
            weight: weight given to the assumed probability
            ap: assumed probability to start from
        """

        # Current observed probability
        basicprob = prf(feature, catagory)

        # Number of times this feature has appeared across all categories
        totals = sum([self.featurecount(feature, c) for c in self.catagories()])

        # Weighted average
        bp = (weight * ap + totals * basicprob) / (weight + totals)
        return bp
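
To see what the smoothing does, here is the formula worked by hand (my own arithmetic, using the counts from the first test session near the end of this post, where 'quick' occurs in 3 'good' documents and 2 'bad' ones):

# Worked example of weightedprob for ('quick', 'good') in the first test below
weight, ap = 1, 0.5
basicprob = 0.75   # featureprob('quick', 'good') = 3/4
totals = 3 + 2     # occurrences of 'quick' across 'good' and 'bad'
print((weight * ap + totals * basicprob) / (weight + totals))   # 0.7083...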
    
class naivebayes(classifier):
    """
    Naive Bayes classifier: assumes the individual feature probabilities are
    independent of one another, so the overall probability is their product.
    """
    def __init__(self, getfeatures):
        classifier.__init__(self, getfeatures)
        self.thresholds = {}

    def setthreshold(self, cat, threshold):
        self.thresholds[cat] = threshold

    def getthreshold(self, cat):
        if cat not in self.thresholds:
            return 1.0
        return self.thresholds[cat]
    
    def docprob(self, item, catagory):
        features = self.getfeatures(item)
        
        # Multiply together the probabilities of all the features: Pr(item | category)
        p = 1
        for f in features:
            p *= self.weightedprob(f, catagory, self.featureprob)
        return p
    
    def prob(self, item, catagory):
        """
        Bayes' theorem: P(A|B) = P(B|A) * P(A) / P(B). Here we compute the
        unnormalized score P(category) * P(item | category); the divisor
        P(item) is the same for every category, so it can be skipped when
        only comparing categories.
        """
        catprob = self.catacount(catagory) / self.totalcount()
        docprob = self.docprob(item, catagory)
        
        return catprob * docprob
    
    def classify(self, item, default=None):
        probs = {}
        # Find the category with the highest probability
        best = default
        maxprob = 0.0
        for cat in self.catagories():
            probs[cat] = self.prob(item, cat)
            if probs[cat] > maxprob:
                maxprob = probs[cat]
                best = cat
        # Make sure the winner exceeds threshold * every other probability
        for cat in probs:
            if cat == best:
                continue
            if probs[cat] * self.getthreshold(best) > probs[best]:
                return default
        return best
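
As a concrete check of prob, this is the arithmetic behind the 0.15625 printed in the naive Bayes test below (my own derivation from the sampletrain data, in which 3 of the 5 documents are 'good'):

# Worked arithmetic for bayes.prob('quick rabbit', 'good') on the
# sampletrain data defined further down
catprob = 3 / 5.0                       # P(good)
p_quick = (0.5 + 3 * (2 / 3.0)) / 4.0   # weightedprob of 'quick' in 'good'
p_rabbit = (0.5 + 1 * (1 / 3.0)) / 2.0  # weightedprob of 'rabbit' in 'good'
print(catprob * p_quick * p_rabbit)     # 0.15625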

class fisherclassifier(classifier):
    """
    Fisher classifier: combines the per-feature probabilities with Fisher's
    method and checks the result against a chi-squared distribution.
    """
    def __init__(self, getfeatures):
        classifier.__init__(self, getfeatures)
        self.minimums = {}

    def setminimums(self, cat, minimums):
        self.minimums[cat] = minimums

    def getminimums(self, cat):
        if cat not in self.minimums:
            return 0
        return self.minimums[cat]
            
    def cprob(self, f, cat):
        # Frequency of this feature in this category
        clf = self.featureprob(f, cat)
        if clf == 0:
            return 0

        # Frequency of this feature across all categories
        freqsum = sum([self.featureprob(f, c) for c in self.catagories()])

        # The probability is the frequency in this category divided by the
        # overall frequency
        p = clf / freqsum
        return p
    
    def fisherprob(self, item, cat):
        # Multiply all the probabilities together
        p = 1
        features = self.getfeatures(item)
        for f in features:
            p *= self.weightedprob(f, cat, self.cprob)

        # Take the natural log and multiply by -2
        fscore = -2 * math.log(p)

        # Feed the score to the inverse chi-squared function to get a probability
        return self.invchi2(fscore, len(features) * 2)
    
    def invchi2(self, chi, df):
        # Series expansion of the chi-squared upper-tail probability,
        # valid for an even number of degrees of freedom df
        m = chi / 2.0
        summ = term = math.exp(-m)
        for i in range(1, df // 2):
            term *= m / i
            summ += term
        return min(summ, 1.0)
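
    # Note: invchi2 computes the upper-tail (survival) probability of a
    # chi-squared distribution with an even number of degrees of freedom.
    # If SciPy happens to be installed, scipy.stats.chi2.sf(chi, df) computes
    # the same quantity (e.g. both give ~0.406 for chi=4.0, df=4); that is
    # only an optional cross-check, nothing the classifier depends on.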
    
    def classify(self, item, default=None):
        # Loop through the categories looking for the best result
        best = default
        maxprob = 0.0
        for c in self.catagories():
            p = self.fisherprob(item, c)
            # Make sure it exceeds the minimum for this category
            if p > self.getminimums(c) and p > maxprob:
                best = c
                maxprob = p
        return best
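
For intuition about cprob: in the sampletrain data below, 'quick' appears in 2 of the 3 'good' documents and in 1 of the 2 'bad' ones, so (my own arithmetic):

# Worked arithmetic for fisher.cprob('quick', 'good') on the sampletrain data
clf = 2 / 3.0                  # featureprob('quick', 'good')
freqsum = 2 / 3.0 + 1 / 2.0    # frequency summed over both categories
print(clf / freqsum)           # 4/7, roughly 0.5714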
        
        
# Training data shared by the tests below
def sampletrain(cl):
    cl.train('Nobody owns the water.','good')
    cl.train('the quick rabbit jumps fences','good')
    cl.train('buy pharmaceuticals now','bad')
    cl.train('make quick money at the online casino','bad')
    cl.train('the quick brown fox jumps','good')

# Basic classifier test
cl = classifier(getwords)
cl.train('the quick brown fox jumps over the lazy dog', 'good')
cl.train('make quick money in the online casino', 'bad')
print(cl.featurecount('quick', 'good'))
print(cl.featurecount('quick', 'bad'))
sampletrain(cl)
print(cl.featureprob('quick', 'good'))
print(cl.weightedprob('quick', 'good', cl.featureprob))
1.0
1.0
0.75
0.708333333333
# Naive Bayes classifier test
bayes = naivebayes(getwords)
sampletrain(bayes)
print(bayes.prob('quick rabbit', 'good'))
print(bayes.prob('quick rabbit', 'bad'))

print(bayes.classify('quick rabbit', 'unknown'))
print(bayes.classify('quick money', 'unknown'))

# Require 3x stronger evidence before classifying as 'bad'
bayes.setthreshold('bad', 3.0)
print(bayes.classify('quick money', 'unknown'))

# More training data restores the classifier's confidence
for i in range(10):
    sampletrain(bayes)
print(bayes.classify('quick money', 'unknown'))
0.15625
0.05
good
bad
unknown
bad
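
Why 'quick money' flips from bad to unknown and back (my own arithmetic from the sampletrain data): after one round of training, P(bad) only narrowly beats P(good), so the 3x threshold forces 'unknown'; repeating the identical training shrinks the smoothed Pr('money' | good) toward zero until 'bad' wins by more than 3x.

# One round of sampletrain: 'bad' barely wins, so a 3x threshold is not met
p_bad = 0.4 * 0.5 * 0.5      # P(bad) * Pr('quick'|bad) * Pr('money'|bad)
p_good = 0.6 * 0.625 * 0.25  # P(good) * Pr('quick'|good) * Pr('money'|good)
print(p_bad, p_good)         # 0.1 vs 0.09375
# After 10 more rounds, Pr('money'|good) drops from (0.5 + 0)/2 to
# (0.5 + 0)/12, so P(bad) now exceeds 3 * P(good) and 'bad' is returned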
# Fisher classifier test
fisher = fisherclassifier(getwords)
sampletrain(fisher)
fisher.cprob('quick', 'good')

print(fisher.fisherprob('quick rabbit', 'good'))
print(fisher.fisherprob('quick rabbit', 'bad'))
print(fisher.classify('quick rabbit'))
print(fisher.classify('quick money'))

# Require fisherprob > 0.8 before assigning an item to 'bad'
fisher.setminimums('bad', 0.8)
print(fisher.classify('quick money'))

# Lower the bar to 0.4 and 'bad' wins again
fisher.setminimums('bad', 0.4)
print(fisher.classify('quick money'))
0.78013986589
0.356335962833
good
bad
good
bad