jieba 分词

对《西游记》全文进行分词,统计出现次数最高的 20 个词语(同一人物的不同称呼合并计数)。

输入:

 1 import jieba
 2 excludes = {"一个", "我们", "怎么", "那里", "不知", "不是", "只见", "两个", "不敢", "这个", "如何", "原来", "甚么", "不曾", "闻言", "正是", "那怪", "一声"}
 3 txt = open("西游记1.txt", "r", encoding='UTF-8').read()
 4 words = jieba.lcut(txt)
 5 jieba.add_word("孙悟空")
 6 jieba.add_word("金公")
 7 jieba.add_word("孙行者")
 8 jieba.add_word("心猿")
 9 jieba.add_word("齐天大圣")
10 jieba.add_word("斗战胜佛")
11 jieba.add_word("美猴王")
12 jieba.add_word("孙行者")
13 jieba.add_word("三藏法师")
14 jieba.add_word("玄奘")
15 jieba.add_word("金蝉子")
16 jieba.add_word("江流儿")
17 jieba.add_word("御弟")
18 jieba.add_word("沙僧")
19 jieba.add_word("沙和尚")
20 jieba.add_word("沙悟净")
21 jieba.add_word("刀圭")
22 jieba.add_word("黄婆")
23 jieba.add_word("悟能")
24 jieba.add_word("猪悟能")
25 jieba.add_word("猪刚鬣")
26 jieba.add_word("木母")
27 jieba.add_word("白龙马")
28 jieba.add_word("天龙马")
29 jieba.add_word("玉龙三太子")
30 jieba.add_word("八部天龙广力菩萨")
31 counts = {}
32 for word in words:
33     if len(word) == 1:
34         continue
35     elif word == "师父" or word == "三藏" or word == "玄奘" or word == "三藏法师" or word == "金蝉子" or word == "江流儿" or word == "御弟":
36         rword = "唐僧"
37     elif word == "大圣" or word == "老孙" or word == "孙悟空" or word == "美猴王" or word == "孙行者" or word == "齐天大圣" or word == "斗战胜佛" or word == "金公" or word == "心猿":
38         rword = "悟空"
39     elif word == "悟能" or word == "八戒" or word == "猪悟能" or word == "呆子" or word == "木母" or word == "猪刚鬣":
40         rword = "猪八戒"
41     elif word == "沙僧" or word == "沙悟净" or word == "沙和尚" or word == "刀圭" or word == "黄婆":
42         rword = "悟净"
43     elif word == "天龙马" or word == "玉龙三太子" or word == "八部天龙广力菩萨":
44         rword = "白龙马"
45     else:
46         rword = word
47     counts[rword] = counts.get(rword, 0) + 1
48 for word in excludes:
49     del(counts[word])
50 items = list(counts.items())
51 items.sort(key=lambda x:x[1], reverse=True)
52 for i in range(20):
53     word, count = items[i]
54     print("{0:<10}{1:>5}".format(word, count))

输出:

 

posted @ 2023-12-19 16:03  尘雯时  阅读(19)  评论(0)  编辑  收藏  举报