爬取豆瓣网评论最多的书籍

相信很多人都有书荒的时候,想要找到一本合适的书籍确实不容易,所以这次利用刚学习到的知识爬取豆瓣网的各类书籍,传送门https://book.douban.com/tag/?view=cloud

首先是这个程序的结构,html_downloader是html下载器,html_outputer是导出到Excel表,html_parser是解析页面,make_wordcloud是制作词云,spided_main是程序入口,url_manager是URL管理器,有兴趣的童鞋可以去慕课网看Python基础爬虫课程。

主要实现思路是先请求下载需要的html,解析得到目标URL并存储到URL管理器中,再从URL管理器中获取得到URL,发送请求,解析得到需要的信息内容,导出到Excel表格,再从Excel表中获取数据进行分析得到词云。

html_downloader:

在这里我使用的是urllib.request进行请求,之前有试过用requests进行请求,但是爬取了几百页就被封了ip,所以弃用requests。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
# -*- coding:utf8 -*-
import urllib.request
from urllib.parse import quote
import string
 
 
class HtmlDownloader(object):
    """Fetch the raw bytes of a web page with ``urllib.request``."""

    def download(self, url):
        """Return the response body for *url*, or ``None`` if it is unusable.

        Args:
            url: Address to fetch; it may contain non-ASCII characters.

        Returns:
            The undecoded response body as ``bytes``, or ``None`` when *url*
            is ``None`` or the reported status code is not 200.
        """
        if url is None:
            return None
        # Percent-encode non-ASCII characters (e.g. Chinese tag names) while
        # leaving every printable ASCII character untouched.
        encoded_url = quote(url, safe=string.printable)
        # The context manager closes the connection even if reading fails.
        with urllib.request.urlopen(encoded_url) as response:
            if response.getcode() != 200:
                return None
            return response.read()  # response content

 

通过分析豆瓣网的结构,可以看到,我们首先传进去的是总的图书分类,但是我们需要的是每一个分类里面的图书信息。所以我们需要得到每一个分类的url,即base_url,再通过这个base_url去获取图书url,即detail_url。

url_manager:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
# -*- coding:utf8 -*-
 
class UrlManage(object):
    """Keep pending and already-handed-out URLs for two crawl levels.

    "Base" URLs are the category pages; "detail" URLs are the content pages.
    A URL is only queued if it is neither pending nor already handed out.
    """

    def __init__(self):
        self.base_urls = set()  # category URLs waiting to be crawled
        self.detail_urls = set()  # detail-page URLs waiting to be crawled
        self.old_base_urls = set()  # category URLs already handed out
        self.old_detail_urls = set()  # detail-page URLs already handed out

    # Add a single URL
    def add_base_url(self, url):
        """Queue *url* as a base URL unless it is ``None`` or already known."""
        if url is None:
            return
        if url not in self.base_urls and url not in self.old_base_urls:
            self.base_urls.add(url)

    def add_detail_url(self, url):
        """Queue *url* as a detail URL unless it is ``None`` or already known."""
        if url is None:
            return
        if url not in self.detail_urls and url not in self.old_detail_urls:
            self.detail_urls.add(url)

    # Add several URLs
    def add_new_detail_urls(self, urls):
        """Queue every URL in *urls* as a detail URL; ``None``/empty is a no-op."""
        if not urls:
            return
        for url in urls:
            self.add_detail_url(url)

    def add_new_base_urls(self, urls):
        """Queue every URL in *urls* as a base URL; ``None``/empty is a no-op."""
        if not urls:
            return
        for url in urls:
            self.add_base_url(url)

    # Check whether any URL is still pending
    def has_new_detail_url(self):
        """Return ``True`` while at least one detail URL is pending."""
        return bool(self.detail_urls)

    def has_new_base_url(self):
        """Return ``True`` while at least one base URL is pending."""
        return bool(self.base_urls)

    # Hand out one pending URL
    def get_base_url(self):
        """Remove an arbitrary pending base URL, record it as used, return it.

        Raises:
            KeyError: If no base URL is pending (``set.pop`` on an empty set).
        """
        new_base_url = self.base_urls.pop()
        self.old_base_urls.add(new_base_url)
        return new_base_url

    def get_detail_url(self):
        """Remove an arbitrary pending detail URL, record it as used, return it.

        Raises:
            KeyError: If no detail URL is pending (``set.pop`` on an empty set).
        """
        new_detail_url = self.detail_urls.pop()
        self.old_detail_urls.add(new_detail_url)
        return new_detail_url

 

解析器 html_parser:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
# -*- coding:utf8 -*-
import re
from urllib.parse import urlparse
from bs4 import BeautifulSoup
 
 
class HtmlParser(object):
    """Helpers that turn tag-index and listing pages into URLs and book data.

    Every method is a ``staticmethod``, so it works both when called on the
    class itself (``HtmlParser.parser(cont)``) and when called on an instance.
    """

    @staticmethod
    def soup(cont):
        """Parse raw page content *cont* into a ``BeautifulSoup`` tree."""
        return BeautifulSoup(cont, 'html.parser', from_encoding='utf-8')

    # Extract the actual data from a listing page
    @staticmethod
    def get_new_data(soup):
        """Map each book title on a listing page to the number shown for it.

        Args:
            soup: Parsed listing page exposing ``select``.

        Returns:
            A dict of title -> first run of digits found in the item's
            ``.pl`` text, kept as a string. The dict is empty when the page
            has no ``.subject-list`` element or that element has no contents.
        """
        subject_lists = soup.select('.subject-list')
        if not subject_lists or not subject_lists[0].contents:
            return {}
        books = {}
        for item in subject_lists[0].select('.subject-item'):
            bookname = item.select('.info')[0].select('a')[0].attrs['title']  # book title
            comment = item.select('.clearfix')[0].select('.pl')[0].text
            digits = re.search(r'\d+', comment)
            if digits is None:
                # No number in the text: skip this book instead of failing
                # the whole page.
                continue
            books[bookname] = digits.group()
        return books

    # Build the listing-page URLs for one category
    @staticmethod
    def get_detail_url(base_url):
        """Return *base_url* plus its paginated variants as a set.

        The variants append ``?start=K&type=T`` for K = 20, 40, ..., 500;
        offset 0 is represented by *base_url* itself.
        """
        detail_urls = {base_url}
        detail_urls.update(
            base_url + '?start={}&type=T'.format(k) for k in range(20, 501, 20)
        )
        return detail_urls

    # Collect every category URL
    @staticmethod
    def get_all_base_urls(soup):
        """Return absolute URLs for every link in the first ``.tagCol`` element.

        Raises:
            IndexError: If the page contains no ``.tagCol`` element.
        """
        links = soup.select('.tagCol')[0].select('a')
        return {
            'https://book.douban.com{}'.format(link.attrs['href'])
            for link in links
        }

    @staticmethod
    def parser(cont):
        """Parse the tag-index page *cont* and return its category URLs."""
        return HtmlParser.get_all_base_urls(HtmlParser.soup(cont))

  

spided_main:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
# -*- coding:utf8 -*-
from douban_spider2 import url_manager, html_downloader, html_parser, html_outputer
 
class SpiderMain(object):
    """Wire the URL manager, downloader, parser and exporter into one crawl."""

    def __init__(self):
        self.urls = url_manager.UrlManage()
        self.downloader = html_downloader.HtmlDownloader()
        # The parser is used through the class itself, not through an instance.
        self.htmlparser = html_parser.HtmlParser
        self.outputer = html_outputer.HtmlOutputer()

    def craw(self, root_url):
        """Crawl from *root_url* and export the collected data.

        Steps: download the root page, queue the base URLs parsed from it,
        expand each base URL into detail URLs, then download and parse the
        detail pages. The merged result is passed to
        ``self.outputer.output_excel``.

        Failures inside either loop are reported with a printed message and
        the loop moves on; only ``Exception`` subclasses are caught, so
        Ctrl-C still stops the crawl.

        Args:
            root_url: URL of the page that lists all categories.
        """
        count = 1
        dictdata = {}
        cont = self.downloader.download(root_url)
        base_urls = self.htmlparser.parser(cont)
        self.urls.add_new_base_urls(base_urls)
        while self.urls.has_new_base_url():
            try:
                base_url = self.urls.get_base_url()
                detail_urls = self.htmlparser.get_detail_url(base_url)
                self.urls.add_new_detail_urls(detail_urls)
            except Exception:
                print('craw failed')

        while self.urls.has_new_detail_url():
            try:
                detail_url = self.urls.get_detail_url()
                print('crow %d : %s' % (count, detail_url))
                html_cont = self.downloader.download(detail_url)
                soup = self.htmlparser.soup(html_cont)
                page_data = self.htmlparser.get_new_data(soup)
                dictdata.update(page_data)
                # The IP was banned before, so stop once 1000 detail pages
                # have been processed successfully.
                if count == 1000:
                    break

                # Only successful pages advance the counter.
                count = count + 1
            except Exception:
                print('craw failed')

        self.outputer.output_excel(dictdata)
 
# Program entry point
if __name__ == "__main__":
    url = 'https://book.douban.com/tag/?view=cloud'
    obj_spider = SpiderMain()
    obj_spider.craw(url)

  

html_outputer:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
# -*- coding:utf8 -*-
import xlwt  #写入Excel表的库
 
class HtmlOutputer(object):
    """Export a mapping to an Excel workbook using ``xlwt``."""

    def __init__(self):
        self.datas = []

    def output_excel(self, dict):
        """Write each key/value pair of *dict* as one row of ``wordCount.xls``.

        Keys go into column 0 and values into column 1 of a sheet named
        ``wordCount``; the file is saved under the current working directory.

        Args:
            dict: Mapping to export. The parameter name shadows the builtin
                but is kept so existing keyword callers keep working.
        """
        wbk = xlwt.Workbook(encoding='utf-8')
        sheet = wbk.add_sheet("wordCount")  # worksheet name
        for row, (key, value) in enumerate(dict.items()):
            sheet.write(row, 0, label=key)
            sheet.write(row, 1, label=value)
        wbk.save('wordCount.xls')  # saved as wordCount.xls

 

导出的Excel表格格式为,一共导出15261条记录

 

 

make_wordcloud:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
# -*- coding:utf8 -*-
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import xlrd
from PIL import Image,ImageSequence
import numpy as np
 
# Load the exported sheet: column 0 is used as the word, column 1 as its weight.
workbook = xlrd.open_workbook('wordCount.xls')
sheet = workbook.sheet_by_name('wordCount')
frequencies = {}
for i in range(sheet.nrows):
    row = sheet.row_values(i)
    # Later rows with the same word overwrite earlier ones.
    frequencies[row[0]] = int(row[1])
print(frequencies)

# The image is passed to WordCloud as its mask.
image = Image.open('./08.png')
graph = np.array(image)
wc = WordCloud(font_path='./fonts/simhei.ttf', background_color='white', max_words=20000, max_font_size=50, min_font_size=1, mask=graph, random_state=100)
wc.generate_from_frequencies(frequencies)
plt.figure()
# Display the generated image
plt.imshow(wc)
plt.axis("off")
plt.show()

 

爬过的坑:

当定义的类有构造函数时候,调用时一定要加上括号,如 f =  html_downloader.HtmlDownloader().download(),而不是 f=  html_downloader.HtmlDownloader.download(),不然就会一直报错,类似于TypeError: get_all_base_urls() takes 1 positional argument but 2 were given。

 

 

生成词云的背景图片我选用的是

最后的做出由15261本书形成的词云

       本次爬虫只是针对图书类热门评论而做出的词云,可以看到涵盖所有分类的书籍里最热门评论的有解忧杂货店,白夜行等,据此我们可以选取比较热门的图书进行阅读,也可以根据此结果再做进一步的分析,获取热门书籍中的评论进行分析人们对于某本书的评价关键词,从而进一步的了解这本图书所描述的内容。

posted @   247李嘉嘉  阅读(870)  评论(0)  编辑  收藏  举报
努力加载评论中...
编辑推荐:
· SQL Server 内存占用高分析
· .NET Core GC计划阶段(plan_phase)底层原理浅谈
· .NET开发智能桌面机器人:用.NET IoT库编写驱动控制两个屏幕
· 用纯.NET开发并制作一个智能桌面机器人:从.NET IoT入门开始
· 一个超经典 WinForm,WPF 卡死问题的终极反思
阅读排行:
· 支付宝事故这事儿,凭什么又是程序员背锅?有没有可能是这样的...
· 在线客服系统 QPS 突破 240/秒,连接数突破 4000,日请求数接近1000万次,.NET 多
· C# 开发工具Visual Studio 介绍
· 在 Windows 10 上实现免密码 SSH 登录
· C#中如何使用异步编程
点击右上角即可分享
微信分享提示