互联网时代的社会语言学:基于SNS的文本数据挖掘(Python)
# -*- coding: utf-8 -*-
"""New-word discovery from Chinese text (SNS text-mining demo).

Candidate words are extracted from a UTF-8 text file by combining three
classic signals:

* raw frequency      (``num_threshold``)
* internal cohesion  (``con_threshold``): P(word) vs. the best split
  P(left) * P(right) -- a real word is far more frequent than chance
  co-occurrence of its parts predicts
* boundary entropy   (``entropy_threshold``) of the characters adjacent
  to the word -- a real word has varied neighbours

Reads ``luxunzawen.txt`` and writes ``word frequency`` lines to
``luxunzawen_out.txt``, sorted by descending frequency.

NOTE(review): ported from Python 2; the unused ``feedparser`` import and
the obsolete ``sys.setdefaultencoding`` hack were removed.
"""
import collections
import math
import re

max_word_len = 5       # maximum candidate word length, in characters
entropy_threshold = 1  # minimum boundary (neighbour) entropy for a real word
num_threshold = 3      # minimum raw frequency for a candidate
con_threshold = 100    # minimum cohesion ratio: larger => more word-like


def info_entropy(words):
    """Return the Shannon entropy (natural log base) of a count mapping.

    ``words`` maps a neighbouring character to its occurrence count.
    An empty mapping yields 0.0 (sum over no terms; no division occurs).
    """
    total = sum(words.values())
    result = 0.0
    for cnt in words.values():
        p = float(cnt) / total
        result -= p * math.log(p)
    return result


def _split_sentences(text):
    """Split raw text into word-character fragments of length >= 2.

    Punctuation, whitespace and Latin/digit runs act as separators, so for
    Chinese input each fragment is a maximal run of hanzi.
    """
    parts = re.split(r"\W+|[a-zA-Z0-9]+|\s+|\n+", text)
    return [part for part in parts if len(part) > 1]


def _ngram_frequencies(sentences):
    """Count every substring of length 1..max_word_len in every sentence."""
    freq = collections.Counter()
    for sentence in sentences:
        length = len(sentence)
        for size in range(1, min(length, max_word_len) + 1):
            for start in range(length - size + 1):
                freq[sentence[start:start + size]] += 1
    return freq


def _cohesive_candidates(freq):
    """Select frequent, cohesive multi-character substrings.

    Cohesion compares P(word) against the largest product of probabilities
    over all two-way splits of the word.  Every proper substring of a
    counted n-gram was itself counted, so the split probabilities are
    always present and non-zero.
    """
    total = sum(freq.values())
    ps = {word: cnt / float(total) for word, cnt in freq.items()}
    candidates = set()
    for word, word_p in ps.items():
        if len(word) < 2:
            continue
        best_split = max(ps[word[:i]] * ps[word[i:]]
                         for i in range(1, len(word)))
        if freq[word] >= num_threshold and word_p / best_split > con_threshold:
            candidates.add(word)
    return candidates


def _boundary_filter(candidates, sentences):
    """Keep candidates whose neighbouring characters are varied enough.

    For each candidate, collect the character immediately left and right of
    its first occurrence in each sentence.  Low entropy on a side means the
    word nearly always has the same neighbour there (probably a fragment of
    a longer word).  A side's check is skipped once the word has been seen
    flush against a sentence boundary on that side -- this mirrors the
    original script's behaviour exactly.
    """
    final_words = set()
    for word in candidates:
        check_left = check_right = True
        left_neighbours = collections.Counter()
        right_neighbours = collections.Counter()
        # re.escape is defensive; fragments contain only word characters.
        pattern = re.compile('.?' + re.escape(word) + '.?')
        for sentence in sentences:
            matches = pattern.findall(sentence)
            if not matches:
                continue
            hit = matches[0]  # only the first occurrence per sentence is used
            if hit[0] != word[0]:
                left_neighbours[hit[0]] += 1
            else:
                check_left = False   # word seen at a left sentence boundary
            if hit[-1] != word[-1]:
                right_neighbours[hit[-1]] += 1
            else:
                check_right = False  # word seen at a right sentence boundary
        if (check_left and left_neighbours
                and info_entropy(left_neighbours) < entropy_threshold):
            continue
        if (check_right and right_neighbours
                and info_entropy(right_neighbours) < entropy_threshold):
            continue
        final_words.add(word)
    return final_words


def main(txtin='luxunzawen.txt', txtout='luxunzawen_out.txt'):
    """Run the full pipeline: read, extract, rank, write.

    Input/output paths are parameterized (defaults preserve the original
    script's behaviour).
    """
    with open(txtin, encoding='utf-8') as fin:
        text = fin.read()
    sentences = _split_sentences(text)
    print("sentences:finish")
    freq = _ngram_frequencies(sentences)
    candidates = _cohesive_candidates(freq)
    final_words = _boundary_filter(candidates, sentences)
    ranked = sorted(final_words, key=lambda w: freq[w], reverse=True)
    with open(txtout, 'w', encoding='utf-8') as fout:
        for word in ranked:
            fout.write("{} {}\n".format(word, freq[word]))
    print('Done')


if __name__ == '__main__':
    main()
C#版本见这里