from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import jieba

content = ["今天天气明媚,万里无云",
           "跟女朋友在中公五方桥基地一起吃完早餐",
           "一起溜达着去天安门广场,去看红旗飘飘",
           "顺着中山公园出来之后去王府井玩",
           "玩完之后一起看少年的你,再吃个晚餐,美滋滋"]

# Use a word-segmentation tool to tokenize the documents (Chinese has no spaces between words)
# pip install jieba
content_list = []
# Segment with jieba in accurate mode (cut_all=False)
for tmp in content:
    # print(tmp)
    seg = jieba.cut(tmp, cut_all=False)
    # print(list(seg))
    seg_content = ",".join(seg)
    # print(seg_content)
    content_list.append(seg_content)
print(content_list)
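
# Note: jieba.cut returns a generator, while jieba.lcut returns a list directly,
# so a hypothetical one-liner equivalent of the loop above would be:
# content_list = [",".join(jieba.lcut(tmp)) for tmp in content]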
# # Instantiate a CountVectorizer (bag-of-words counts), excluding stop words
# conv = CountVectorizer(stop_words=["溜达", "顺着"])
# # Count word occurrences per document
# x = conv.fit_transform(content_list)
#
# # Get the vocabulary (use get_feature_names() on older scikit-learn)
# feature_names = conv.get_feature_names_out()
# res = x.toarray()
# print(feature_names)
# print("res:\n", res)
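
# In the count matrix above, each row corresponds to one document and each
# column to one vocabulary word; the stop_words list simply drops those words
# from the vocabulary before counting.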

# Word importance statistics (tf-idf)
# Instantiate a TfidfVectorizer, excluding stop words
tfidf = TfidfVectorizer(stop_words=["溜达", "顺着"])
# Compute the tf-idf weight of every word in every document
x = tfidf.fit_transform(content_list)

# Get the vocabulary (use get_feature_names() on older scikit-learn)
feature_names = tfidf.get_feature_names_out()
res = x.toarray()
print(feature_names)
print("res:\n", res)