Making a word cloud

Fetch the comments under a Zhihu answer through its web API, segment them with jieba, and render the result with the wordcloud library.

from wordcloud import WordCloud
import matplotlib.pyplot as plt
import requests
import json
import jieba
import binascii
from urllib.parse import urlencode
import sys
def get_json(url):
    headers = {
        'cookie': 'd_c0="AHBkqIQryw2PTouoMHWcxeuH1TbpVnfySVU=|1529752806"; _zap=83268240-1571-4dd8-a7d0-52d2c5148795;       _xsrf=PB8R3d9skggRQW2MQBu2AfAOL9g5IETF; q_c1=3b96c96861f3458db95a2fa70fc3fb65|1532427518000|1529752806000; tgw_l7_route=1c2b7f9548c57cd7d5a535ac4812e20e; l_n_c=1; l_cap_id="YzVmZjE3NDNjZDY1NDA1YWJhZTVlOTTMwOTFiYjQ=|1533105697|aa6d722a350fe7fd049fadfb92dcdc363946f1e9"; r_cap_id="NzE5OTI0OGFmMDQ2NGExNjllOWEzODhiN2YxZDM4MDk=|1533105697|3fb366cf6109df0d32807ebb2acc2997edb293c6"; cap_id="Zjg3MmJhYTRkMjE4NGU4NmE5MGM4ZDBhMDQxMjc3ZDM=|1533105697|279da2af57248cc79a4fa522c0343e4f999c2a25"; n_c=1',
        'referer': 'https://www.zhihu.com/question/287656023',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'
    }
    response = requests.get(url, headers=headers)
    response.encoding = response.apparent_encoding  # apparent_encoding guesses the charset; it only affects response.text
    return response.text
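A quick sanity check for get_json before wiring up the full loop (an assumption-laden sketch: it only works while the cookie above is still valid and the answer is public; the URL mirrors the offset-0 page built in main below):

first_page = json.loads(get_json('https://www.zhihu.com/api/v4/answers/457413146/comments?limit=20&offset=0'))
print(len(first_page['data']))  # number of comments returned for this page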
def get_comments(code_json):
    json_dict = json.loads(code_json)
    for item in json_dict['data']:
        # round-trip through hex: encode the comment to its hex representation, then decode back to a UTF-8 string
        comment = item['content'].encode('utf-8')
        comment = binascii.b2a_hex(comment)
        comment = binascii.a2b_hex(comment).decode('utf8')
        yield comment
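Note that the hex conversion above is a round trip: b2a_hex renders the UTF-8 bytes as hex and a2b_hex immediately converts them back, so the decoded text equals item['content']. A minimal sketch of that round trip (the sample string is arbitrary):

sample = '词云'.encode('utf-8')      # b'\xe8\xaf\x8d\xe4\xba\x91'
hexed = binascii.b2a_hex(sample)    # b'e8af8de4ba91'
restored = binascii.a2b_hex(hexed)  # the original bytes again
assert restored.decode('utf-8') == '词云'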

def wordcloud(all_comments):
    # Segment each comment with jieba and drop stopwords.
    # Remember encoding='utf-8' when opening or saving files, or reads/writes will fail.
    def seg_sentence(sentence):
        sentence_seged = jieba.cut(sentence.strip(), cut_all=False)  # precise mode
        stopwords = [line.strip() for line in open('C:\\Users\\12462\\Desktop\\stopwords.txt', 'r', encoding='utf-8').readlines()]  # path to the stopword list
        outstr = ''
        for word in sentence_seged:
            if word not in stopwords and word != '\t':
                outstr += word + ' '
        return outstr

    for line in all_comments:
        line_seg = seg_sentence(line)  # returns a space-separated string
        with open('outputs.txt', 'a', encoding='utf-8') as f:
            f.write(line_seg + '\n')

    data = open('outputs.txt', 'r', encoding='utf-8').read()
    my_wordcloud = WordCloud(
        background_color='white',  # background color
        max_words=200,  # maximum number of words to render
        font_path='c:\\windows\\Fonts\\simhei.ttf',  # a font that covers Chinese is required, otherwise the characters will not display;
        # a traceback ending in "OSError: cannot open resource" usually means the font file does not exist on your machine
    ).generate(data)
    plt.figure()
    plt.imshow(my_wordcloud)
    plt.axis('off')
    plt.show()  # display the word cloud
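For reference, a minimal sketch of what seg_sentence produces (the sentence and the two-word stopword set are made up for illustration; jieba's exact segmentation can vary with its dictionary version):

stopwords = {'的', '了'}  # hypothetical tiny stopword set
words = jieba.cut('今天的天气真好', cut_all=False)  # precise mode
print(' '.join(w for w in words if w not in stopwords and w != '\t'))
# typically prints something like: 今天 天气 真好

To save the image to disk instead of (or as well as) showing it, WordCloud also provides to_file, e.g. my_wordcloud.to_file('wordcloud.png').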

 

def main():
    comment_list = []
    for i in range(0, 800, 20):
        url = "https://www.zhihu.com/api/v4/answers/457413146/comments?"
        data = {
            'include': 'data[*].author,collapsed,reply_to_author,disliked,content,voting,vote_count,is_parent_author,is_author',
            'order': 'normal',
            'limit': '20',
            'offset': i,
            'status': 'open'
        }
        url = url + urlencode(data)
        code_json = get_json(url)
        sys.stdout.write(" downloaded: %.3f%%" % float(i / 800 * 100) + '\r')  # '\r' returns to the start of the line, so the progress stays on a single line
        sys.stdout.flush()
        for result in get_comments(code_json):
            comment_list.append(result)
    wordcloud(comment_list)  # build the word cloud once all pages are collected

if __name__ == '__main__':
    main()
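As a side note, requests can build the query string itself through its params argument, so the manual urlencode step is optional. A sketch of the equivalent request (headers here stands for the same dict defined in get_json):

data = {
    'include': 'data[*].author,collapsed,reply_to_author,disliked,content,voting,vote_count,is_parent_author,is_author',
    'order': 'normal',
    'limit': '20',
    'offset': 0,
    'status': 'open'
}
response = requests.get('https://www.zhihu.com/api/v4/answers/457413146/comments',
                        params=data, headers=headers)  # requests URL-encodes data for us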
