jieba (结巴) Chinese Word Segmentation
# -*- coding: utf-8 -*-
import sys
import jieba
import jieba.posseg as pseg
import jieba.analyse as anal
from optparse import OptionParser

usage = "usage: python %prog [--tag] [--fast] [--tfidf topK] [--textr topK] [--stopdict path]"
parser = OptionParser(usage)
parser.add_option("--tag", dest="tag", action="store_true")    # print part-of-speech tags
parser.add_option("--fast", dest="fast", action="store_true")  # parallel segmentation
parser.add_option("--tfidf", dest="tfidf")                     # top-K keywords by TF-IDF
parser.add_option("--textr", dest="textr")                     # top-K keywords by TextRank
parser.add_option("--stopdict", dest="stopdict")               # stopword file, one word per line
opt, args = parser.parse_args()

def wordFilter(wordlist):
    """Drop stopwords from a list of segmented words."""
    if opt.stopdict:
        with open(opt.stopdict, "r") as f:
            stopList = f.read().strip().split("\n")
    else:
        print "please specify a stopword file path with --stopdict"
        stopList = []
    returnlist = []
    for word in wordlist:
        if word:
            word = word.encode("utf-8")  # jieba yields unicode; the stopword file is utf-8 bytes
            if word not in stopList:
                returnlist.append(word)
    return returnlist

def wordPosFilter(wordlist):
    """Keep only words whose part-of-speech tag is in save_pos, minus stopwords."""
    if opt.stopdict:
        with open(opt.stopdict, "r") as f:
            stopList = f.read().strip().split("\n")
    else:
        stopList = []
    returnlist = []
    save_pos = ["an", "n", "nr", "ns", "nt", "nz", "v", "vd", "eng", "ni"]
    for w in wordlist:
        word = w.word.encode("utf-8")
        pos = w.flag
        if word not in stopList and pos in save_pos:
            returnlist.append(word)
    return returnlist

txt = "支持三种分词模式: 精确模式,试图将句子最精确地切开,适合文本分析; \
全模式,把句子中所有的可以成词的词语都扫描出来, 速度非常快,但是不能解决歧义; \
搜索引擎模式,在精确模式的基础上,对长词再次切分,提高召回率,适合用于搜索引擎分词。 \
支持繁体分词 支持自定义词典 MIT 授权协议 在线演示"

# parallel segmentation with 10 worker processes (POSIX only)
if opt.fast:
    jieba.enable_parallel(10)

# tune the dictionary: add a custom word, and force "协议" to be split
# (the tuple form of suggest_freq lowers the joined word's frequency)
jieba.add_word("全模式")
jieba.suggest_freq(("协", "议"), True)
#jieba.load_userdict(dictfilepath)

# cut() returns a generator; lcut() returns a list
#print "/".join(jieba.cut(txt))
words = jieba.lcut(txt)
print "/".join(words)

print "========================= filter ==========================="
words = wordFilter(words)
print "/".join(words)

print "========================= posFilter ==========================="
pos_words = wordPosFilter(pseg.lcut(txt))
print "/".join(pos_words)

#sys.exit()  # uncomment to stop after the filter demos

# default (accurate) mode as a list
print "/".join(jieba.lcut(txt))
# search-engine mode: long words are cut again to raise recall
print "/".join(jieba.cut_for_search(txt))

# tokenize() reports each word's start/end offset; it requires unicode input
res = jieba.tokenize(txt.decode("utf-8"))
#res = jieba.tokenize(txt.decode("utf-8"), mode="search")  # search mode
print "word\t\tstart\t\tend"
for tk in res:
    print "%s\t\t %d \t\t %d" % (tk[0], tk[1], tk[2])

# part-of-speech tagging
if opt.tag:
    for w, k in pseg.cut(txt):
        print w + "(" + k + ")",

# top-K keywords ranked by TF-IDF
if opt.tfidf:
    topK = int(opt.tfidf)
    for word, weight in anal.extract_tags(txt, topK, withWeight=True):
        print word, weight

# top-K keywords ranked by TextRank
if opt.textr:
    topK = int(opt.textr)
    for word, weight in anal.textrank(txt, topK, withWeight=True):
        print word, weight
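Assuming the script above is saved as jieba_demo.py (the filename is a placeholder) and the file passed to --stopdict holds one UTF-8 stopword per line, as the read/split logic in wordFilter expects, sample invocations under Python 2 would look like:

python jieba_demo.py --stopdict dict/stopword.txt
python jieba_demo.py --fast --tag --tfidf 10 --textr 10 --stopdict dict/stopword.txt

The first run prints the plain, stopword-filtered, and POS-filtered segmentations; the second additionally enables parallel cutting, POS tagging, and the top-10 keywords by TF-IDF and by TextRank.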