from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import jieba

content = ["今天天气明媚,万里无云",
           "跟女朋友在中公五方桥基地一起吃完早餐",
           "一起溜达着去天安门广场,去看红旗飘飘",
           "顺着中山公园出来之后去王府井玩",
           "玩完之后一起看少年的你,再吃个晚餐,美滋滋"]

# Use a word-segmentation tool to tokenize the sentences
# pip install jieba
content_list = []
# Segment with jieba in accurate (precise) mode
for tmp in content:
    # print(tmp)
    seg = jieba.cut(tmp, cut_all=False)
    # print(list(seg))
    # Join the tokens with spaces so the vectorizers can split them back into words
    seg_content = " ".join(seg)
    # print(seg_content)
    content_list.append(seg_content)
print(content_list)

# Bag-of-words alternative (kept commented out):
# # Instantiate the vectorizer
# conv = CountVectorizer(stop_words=["溜达", "顺着"])
# # Count word occurrences
# x = conv.fit_transform(content_list)
#
# # Get the extracted vocabulary
# feature_names = conv.get_feature_names_out()
# res = x.toarray()
# print(feature_names)
# print("res:\n", res)

# Word-importance statistics (TF-IDF)
# Instantiate the vectorizer
tfidf = TfidfVectorizer(stop_words=["溜达", "顺着"])
# Compute the importance (TF-IDF weight) of each word
x = tfidf.fit_transform(content_list)

# Get the extracted vocabulary
# (get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out())
feature_names = tfidf.get_feature_names_out()
res = x.toarray()
print(feature_names)
print("res:\n", res)
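
# Optional follow-up (a minimal sketch, not part of the original script): make the
# TF-IDF matrix easier to read by pairing each vocabulary word with its weight per
# document. With scikit-learn's defaults, each cell is raw term count times
# idf(t) = ln((1 + n) / (1 + df(t))) + 1, and each row is L2-normalized.
# It reuses the `feature_names` and `res` variables defined above.
for i, row in enumerate(res):
    # Sort words by descending weight and keep only those that actually appear
    ranked = sorted(zip(feature_names, row), key=lambda pair: pair[1], reverse=True)
    top_words = [(word, round(float(weight), 3)) for word, weight in ranked if weight > 0]
    print(f"document {i}: {top_words}")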