Sociolinguistics in the Internet Age: Text Data Mining Based on SNS (Python)

# -*- coding: utf-8 -*-
# New-word discovery for Chinese text (Python 2): candidate n-grams are kept
# only if they are frequent, internally cohesive, and have high left/right
# neighbor entropy.
import re
import collections
import math
 
def info_entropy(words):
    # Shannon entropy (natural log) of a Counter of neighboring characters.
    result = 0
    total = sum(words.itervalues())
    for word, cnt in words.iteritems():
        p = float(cnt) / total
        result -= p * math.log(p)
    return result
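# Quick illustration (made-up counts): two equally likely neighbors,
# e.g. Counter({u'一': 1, u'这': 1}), give entropy ln 2 ≈ 0.693, while a single
# dominant neighbor such as Counter({u'力': 10}) gives entropy 0.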
 
max_word_len = 5            # maximum candidate word length, in characters
entropy_threshold = 1       # minimum left/right neighbor entropy
num_threshold = 3           # minimum occurrence count for a candidate
con_threshold = 100         # cohesion threshold: the larger the ratio, the more likely a real word
txtin = u'luxunzawen.txt'
txtout = u'luxunzawen_out.txt'
f = open(txtin, 'r').read().decode('utf-8')

# Split the corpus into CJK-only fragments: punctuation, ASCII letters/digits
# and whitespace all act as separators.
bef_sentences = re.split(r"\W+|[a-zA-Z0-9]+|\s+|\n+", f, 0, re.UNICODE)
sentences = [one for one in bef_sentences if len(one) > 1]
print("sentences: finished")

freq = collections.Counter()                        # frequency of every n-gram up to max_word_len characters
for sentence in sentences:
    if sentence:
        l = len(sentence)
        wl = min(l, max_word_len)
        for i in xrange(1, wl + 1):
            for j in xrange(0, l - i + 1):
                freq[sentence[j:j + i]] += 1
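# For example (hypothetical fragment), u'天安门' contributes the n-grams
# 天, 安, 门, 天安, 安门 and 天安门, each incrementing its count by one.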
                
total = sum(freq.itervalues())
ps = collections.defaultdict(int)
for word, val in freq.iteritems():
    ps[word] = float(val) / total               # relative frequency of each n-gram
 
words = set()
for word, word_p in ps.items():
    if len(word) > 1:
        # cohesion: compare p(word) with the best two-way split p(left) * p(right)
        p = 0
        for i in xrange(1, len(word)):
            t = ps[word[0:i]] * ps[word[i:]]
            p = max(p, t)
        if freq[word] >= num_threshold and word_p / p > con_threshold:    # frequent enough and strongly cohesive
            words.add(word)
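# Worked example (illustrative numbers only): if p(u'葡萄') = 0.0005 while
# p(u'葡') = p(u'萄') = 0.001, the best split gives p = 1e-6, so the cohesion
# ratio is 0.0005 / 1e-6 = 500 > con_threshold and u'葡萄' survives this filter.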
 
final_words = set()
for word in words:
    lf = rf = True            # apply the left / right entropy checks unless the word hits a fragment edge
    left_words = collections.Counter()        # characters seen immediately to the left of word
    right_words = collections.Counter()       # characters seen immediately to the right of word
    pattern = re.compile(u'.?' + re.escape(word) + u'.?')
    for sentence in sentences:
        l = pattern.findall(sentence)
        if l:
            if l[0][0] != word[0]:
                left_words[l[0][0]] += 1      # a left-hand neighbor exists
            else:
                lf = False                    # word starts a fragment at least once: skip the left check
            if l[0][-1] != word[-1]:
                right_words[l[0][-1]] += 1    # a right-hand neighbor exists
            else:
                rf = False                    # word ends a fragment at least once: skip the right check

    left_info_entropy = info_entropy(left_words)
    right_info_entropy = info_entropy(right_words)

    # Drop candidates whose neighbors on either side are too predictable (low entropy).
    if lf and len(left_words) > 0 and left_info_entropy < entropy_threshold:
        continue
    if rf and len(right_words) > 0 and right_info_entropy < entropy_threshold:
        continue
    final_words.add(word)
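# Intuition for the entropy filter (illustrative example): a fragment like
# u'巧克' is almost always followed by u'力', so its right-neighbor entropy is
# close to zero and it is rejected, while a genuine word such as u'巧克力' can
# be followed by many different characters and keeps a high neighbor entropy.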
    
words_list = list(final_words)
words_list.sort(key=lambda w: freq[w], reverse=True)    # most frequent first
# for word in words_list:
#     print word.encode('gb2312'), freq[word]

    
fout = open(txtout, 'w')
for word in words_list:
    result = word + u"   " + str(freq[word]) + u"\n"
    fout.write(result.encode('utf-8'))                  # one "word   count" pair per line
fout.close()

print('Done')
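Usage note (a sketch under the script's own assumptions; the sample lines below are made up purely to show the format): put a UTF-8 plain-text corpus named luxunzawen.txt next to the script and run it under Python 2. The output file luxunzawen_out.txt then lists the discovered words with their raw frequencies, most frequent first, e.g.:

先生   212
国民   87
文学   54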

For a C# version, see here.

posted @ 2013-09-23 09:10 阿黄的苹果