Making a word cloud

Fetch the comments under a Zhihu answer through its web API, segment them with jieba, and render the result with the wordcloud library.

from wordcloud import WordCloud
import matplotlib.pyplot as plt
import requests
import json
import jieba
import binascii
from urllib.parse import urlencode
import sys
def get_json(url):
    headers = {
        'cookie': 'd_c0="AHBkqIQryw2PTouoMHWcxeuH1TbpVnfySVU=|1529752806"; _zap=83268240-1571-4dd8-a7d0-52d2c5148795;       _xsrf=PB8R3d9skggRQW2MQBu2AfAOL9g5IETF; q_c1=3b96c96861f3458db95a2fa70fc3fb65|1532427518000|1529752806000; tgw_l7_route=1c2b7f9548c57cd7d5a535ac4812e20e; l_n_c=1; l_cap_id="YzVmZjE3NDNjZDY1NDA1YWJhZTVlOTTMwOTFiYjQ=|1533105697|aa6d722a350fe7fd049fadfb92dcdc363946f1e9"; r_cap_id="NzE5OTI0OGFmMDQ2NGExNjllOWEzODhiN2YxZDM4MDk=|1533105697|3fb366cf6109df0d32807ebb2acc2997edb293c6"; cap_id="Zjg3MmJhYTRkMjE4NGU4NmE5MGM4ZDBhMDQxMjc3ZDM=|1533105697|279da2af57248cc79a4fa522c0343e4f999c2a25"; n_c=1',
        'referer': 'https://www.zhihu.com/question/287656023',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'
    }
    response = requests.get(url, headers=headers)
    response.encoding = response.apparent_encoding  # apparent_encoding guesses the charset; it only affects response.text
    return response.text
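A quick sanity check for get_json before wiring up the full loop (an assumption-laden sketch: it only works while the cookie above is still valid and the answer is public; the URL mirrors the offset-0 page built in main below):

first_page = json.loads(get_json('https://www.zhihu.com/api/v4/answers/457413146/comments?limit=20&offset=0'))
print(len(first_page['data']))  # number of comments returned for this page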
def get_comments(code_json):
    json_dict = json.loads(code_json)
    for item in json_dict['data']:
        # round-trip through hex: encode the comment to its hex representation, then decode back to a UTF-8 string
        comment = item['content'].encode('utf-8')
        comment = binascii.b2a_hex(comment)
        comment = binascii.a2b_hex(comment).decode('utf8')
        yield comment
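Note that the hex conversion above is a round trip: b2a_hex renders the UTF-8 bytes as hex and a2b_hex immediately converts them back, so the decoded text equals item['content']. A minimal sketch of that round trip (the sample string is arbitrary):

sample = '词云'.encode('utf-8')      # b'\xe8\xaf\x8d\xe4\xba\x91'
hexed = binascii.b2a_hex(sample)    # b'e8af8de4ba91'
restored = binascii.a2b_hex(hexed)  # the original bytes again
assert restored.decode('utf-8') == '词云'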

def wordcloud(all_comments):
    # Segment each comment with jieba and drop stopwords.
    # Remember encoding='utf-8' when opening or saving files, or reads/writes will fail.
    def seg_sentence(sentence):
        sentence_seged = jieba.cut(sentence.strip(), cut_all=False)  # precise mode
        stopwords = [line.strip() for line in open('C:\\Users\\12462\\Desktop\\stopwords.txt', 'r', encoding='utf-8').readlines()]  # path to the stopword list
        outstr = ''
        for word in sentence_seged:
            if word not in stopwords and word != '\t':
                outstr += word + ' '
        return outstr

    for line in all_comments:
        line_seg = seg_sentence(line)  # returns a space-separated string
        with open('outputs.txt', 'a', encoding='utf-8') as f:
            f.write(line_seg + '\n')

    data = open('outputs.txt', 'r', encoding='utf-8').read()
    my_wordcloud = WordCloud(
        background_color='white',  # background color
        max_words=200,  # maximum number of words to render
        font_path='c:\\windows\\Fonts\\simhei.ttf',  # a font that covers Chinese is required, otherwise the characters will not display;
        # a traceback ending in "OSError: cannot open resource" usually means the font file does not exist on your machine
    ).generate(data)
    plt.figure()
    plt.imshow(my_wordcloud)
    plt.axis('off')
    plt.show()  # display the word cloud
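For reference, a minimal sketch of what seg_sentence produces (the sentence and the two-word stopword set are made up for illustration; jieba's exact segmentation can vary with its dictionary version):

stopwords = {'的', '了'}  # hypothetical tiny stopword set
words = jieba.cut('今天的天气真好', cut_all=False)  # precise mode
print(' '.join(w for w in words if w not in stopwords and w != '\t'))
# typically prints something like: 今天 天气 真好

To save the image to disk instead of (or as well as) showing it, WordCloud also provides to_file, e.g. my_wordcloud.to_file('wordcloud.png').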

 

def main():
    comment_list = []
    for i in range(0, 800, 20):
        url = "https://www.zhihu.com/api/v4/answers/457413146/comments?"
        data = {
            'include': 'data[*].author,collapsed,reply_to_author,disliked,content,voting,vote_count,is_parent_author,is_author',
            'order': 'normal',
            'limit': '20',
            'offset': i,
            'status': 'open'
        }
        url = url + urlencode(data)
        code_json = get_json(url)
        sys.stdout.write(" downloaded: %.3f%%" % float(i / 800 * 100) + '\r')  # '\r' returns to the start of the line, so the progress stays on a single line
        sys.stdout.flush()
        for result in get_comments(code_json):
            comment_list.append(result)
    wordcloud(comment_list)  # build the word cloud once all pages are collected

if __name__ == '__main__':
    main()
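As a side note, requests can build the query string itself through its params argument, so the manual urlencode step is optional. A sketch of the equivalent request (headers here stands for the same dict defined in get_json):

data = {
    'include': 'data[*].author,collapsed,reply_to_author,disliked,content,voting,vote_count,is_parent_author,is_author',
    'order': 'normal',
    'limit': '20',
    'offset': 0,
    'status': 'open'
}
response = requests.get('https://www.zhihu.com/api/v4/answers/457413146/comments',
                        params=data, headers=headers)  # requests URL-encodes data for us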
