# Count how often each character of "Journey to the West" (西游记) appears,
# filtering out noise words (articles, pronouns, ...) and printing the most
# frequent names in descending order (original note: p173).

# Frequent words that are not character names and must not be reported.
excludes = {
    "一个", "那里", "怎么", "我们", "不知", "两个", "甚么", "不是", "只见", "原来",
    "如何", "这个", "不曾", "不敢", "闻言", "正是", "只是", "那怪", "出来", "一声",
    "真个", "不得", "这里", "今日", "那个", "取经", "却说", "如今", "三个", "这般",
    "就是", "不见", "铁棒", "认得", "不能", "不要", "果然", "上前", "有些", "性命",
}

# Alternative names mapped to the canonical name they are counted under.
ALIASES = {
    "师父": "唐僧",
    # 三藏 (Sanzang) is Tang Seng's name; the original merged it into 沙僧.
    "三藏": "唐僧",
    "老孙": "悟空",
    "大圣": "悟空",
    "孙行者": "悟空",
    "孙大圣": "悟空",
}


def count_names(words, excluded=excludes, aliases=ALIASES):
    """Return a dict mapping each word to its number of occurrences.

    Args:
        words: iterable of already-segmented words.
        excluded: words to drop from the result after counting.
        aliases: mapping from an alternative name to its canonical name;
            occurrences of an alias are added to the canonical name's count.

    Single-character words are ignored. Keys keep first-occurrence order.
    """
    counts = {}
    for word in words:
        if len(word) == 1:
            continue
        name = aliases.get(word, word)
        counts[name] = counts.get(name, 0) + 1
    for word in excluded:
        # pop() with a default: an excluded word may never occur in the text.
        counts.pop(word, None)
    return counts


def top_names(counts, limit=20):
    """Return up to ``limit`` (word, count) pairs, most frequent first.

    Ties keep the dict's insertion order because ``sorted`` is stable.
    """
    ranked = sorted(counts.items(), key=lambda item: item[1], reverse=True)
    # Slicing copes with fewer than ``limit`` entries instead of raising.
    return ranked[:limit]


def main(path="西游记.txt", limit=20):
    """Segment the novel at ``path`` with jieba and print the top names."""
    # Imported here so the counting helpers work without jieba installed.
    import jieba

    with open(path, "r", encoding="utf-8") as novel:
        txt = novel.read()
    counts = count_names(jieba.lcut(txt))
    for word, count in top_names(counts, limit):
        print("{0:<10}{1:>5}".format(word, count))


if __name__ == "__main__":
    main()
# 运行结果 (run results)