1.爬取数据需要的类库

import requests
import re
from bs4 import BeautifulSoup
import jieba.analyse
from PIL import Image,ImageSequence
import numpy as np
import matplotlib.pyplot as plt
from wordcloud import WordCloud,ImageColorGenerator

2.安装wordcloud库时会发生报错

解决方法是:

  • 按照报错提示去官网下载VC++的构建工具,但是安装包所占空间太大,有好几个G
  • 去https://www.lfd.uci.edu/~gohlke/pythonlibs/#wordcloud下载whl文件,选取对应python的版本号和系统位数

3.爬取的基本思路

查看网站的html节点,爬取虎扑NBA新闻的标题和内容页,将爬取的内容保存为txt文件,对其进行分词,生成词云。

爬取1万2千条数据,共三百万字(最初我也不知道这么多)

import requests
import re
from bs4 import BeautifulSoup
import jieba.analyse
from PIL import Image,ImageSequence
import numpy as np
import matplotlib.pyplot as plt
from wordcloud import WordCloud,ImageColorGenerator
# First listing page of Hupu's NBA news section; passed to AlltitleAndUrl below.
url ='https://voice.hupu.com/nba/1'
# Collect the titles and bodies of the first 12000 Hupu NBA news items
def _save_listing_articles(listing_soup, count, out_path='dongman.txt'):
    """Download every article linked from one listing page and append it to a file.

    Args:
        listing_soup: Parsed BeautifulSoup document of a news listing page.
        count: Number of articles saved so far (running total across pages).
        out_path: Text file the title and body of each article are appended to.

    Returns:
        The updated running total of saved articles.
    """
    for item in listing_soup.select('li'):
        heading = item.find('h4')
        # Only <li> entries whose <h4> carries a link are treated as news items.
        if heading is None or heading.a is None or not heading.a.get('href'):
            continue

        title = heading.text
        page = requests.get(heading.a['href'])
        page.encoding = 'utf-8'
        article = BeautifulSoup(page.text, 'html.parser')

        bodies = article.select('div .artical-main-content')
        if not bodies:
            # Article page without the expected content node: skip it rather
            # than abort the whole crawl with an IndexError.
            continue
        context = bodies[0].text

        count += 1
        # `with` guarantees the file is closed even if a write fails.
        with open(out_path, 'a', encoding='utf-8') as f:
            f.write(title)
            f.write(context)

        print("文章标题:" + title)
        print(context)
        print(count)
    return count


def AlltitleAndUrl(url):
    """Crawl Hupu NBA news listings and append each article to 'dongman.txt'.

    The page at ``url`` is processed first, followed by listing pages 2..200
    of https://voice.hupu.com/nba/. For every news entry the linked article
    is fetched, and its title and body text are appended to the output file
    and echoed to stdout together with a running article count.

    Args:
        url: Address of the first listing page.
    """
    first_page = requests.get(url)
    first_page.encoding = 'utf-8'
    count = _save_listing_articles(
        BeautifulSoup(first_page.text, 'html.parser'), 0)

    # Remaining listing pages.
    for page_no in range(2, 201):
        nexturl = 'https://voice.hupu.com/nba/%s' % page_no
        newcontent = requests.get(nexturl)
        newcontent.encoding = 'utf-8'
        # Parse and walk THIS page's listing; the previous code iterated the
        # first page's soup again here, so pages 2..200 were never scraped.
        count = _save_listing_articles(
            BeautifulSoup(newcontent.text, 'html.parser'), count)


def getWord(text_path='3.txt', mask_path='body.png',
            font_path='./fonts/simhei.ttf', output_path='dream.png'):
    """Build, display and save a word cloud from a UTF-8 text file.

    Keywords are extracted with jieba's TextRank (top 2000, with weights),
    printed, and rendered into a word cloud shaped and coloured by the mask
    image.

    Args:
        text_path: UTF-8 text file holding the corpus to analyse.
        mask_path: Image used both as the cloud's mask and as its colour source.
        font_path: Font file passed to WordCloud for rendering the words.
        output_path: Where the rendered word cloud image is written.
    """
    # Read the whole corpus in one go. The previous `for line in f: f.read()`
    # loop consumed the first line via iteration and then discarded it.
    with open(text_path, 'r', encoding='utf-8') as f:
        lyric = f.read()

    # Keyword extraction: list of (word, weight) pairs -> frequency mapping.
    result = jieba.analyse.textrank(lyric, topK=2000, withWeight=True)
    keywords = {word: weight for word, weight in result}
    print(keywords)

    # Template image that defines the cloud's shape and colours.
    graph = np.array(Image.open(mask_path))

    # Word cloud configuration.
    wc = WordCloud(font_path=font_path, background_color='White',
                   max_words=230, mask=graph, random_state=30, scale=1.5)
    wc.generate_from_frequencies(keywords)

    # Recolour the words from the mask image; the plain, un-recoloured
    # imshow that used to precede this was immediately drawn over.
    image_color = ImageColorGenerator(graph)
    recolored = wc.recolor(color_func=image_color)

    # Save before showing: plt.show() blocks until the window is closed.
    wc.to_file(output_path)

    plt.imshow(recolored)
    plt.axis("off")
    plt.show()


# Script entry: render the word cloud, then run the crawler.
# NOTE(review): getWord() reads '3.txt' while AlltitleAndUrl() appends to
# 'dongman.txt', and the cloud is generated before the crawl runs —
# presumably '3.txt' is a previously saved copy of the crawl output; confirm.
getWord()
AlltitleAndUrl(url)

数据截图:

结果截图:

posted on 2018-04-22 23:27  224杨晓潮  阅读(726)  评论(1)  编辑  收藏  举报