Using Scrapy to Crawl Douban Books and Store Them in a Database

I. Objective

  By designing and implementing a Scrapy crawling project, get familiar with the Scrapy framework and basic MySQL operations, and learn how to crawl web data with Scrapy and save it to a database.

 

II. Analyzing the Page Structure
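  Each book entry on the tag page sits in a div with class "info": the title is in h2/a, the "author / publisher / date / price" line is in div.pub, the rating and ratings count are in the star block, and the blurb is in a p tag (inferred from the XPath expressions used by the spider below). One way to confirm the structure interactively, assuming the markup has not changed and Douban lets the request through, is scrapy shell:

scrapy shell "https://book.douban.com/tag/%E4%B8%AD%E5%9B%BD%E6%96%87%E5%AD%A6"
>>> response.xpath('//div[@class="info"]/h2/a/text()').extract_first()                 # book title
>>> response.xpath('//div[@class="info"]/div[@class="pub"]/text()').extract_first()    # "author / publisher / date / price" line
>>> response.xpath("//span[@class='next']/a/@href").extract_first()                    # relative link to the next page

  If Douban returns a 403 here, set a browser-like USER_AGENT first (see "Modify settings.py" below).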

  

 

III. Create a Scrapy Project Named douban

scrapy startproject douban

  

IV. Writing and Modifying the Code

1. Modify settings.py
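The post does not reproduce the exact settings changes; a minimal sketch of the usual adjustments for this kind of crawl (assumed here, not taken from the original) is:

# settings.py -- assumed minimal changes, adjust to your own project
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'   # browser-like UA; Douban tends to reject the default one
ROBOTSTXT_OBEY = False                                      # otherwise robots.txt rules may stop the crawl
DOWNLOAD_DELAY = 1                                          # optional throttling (the spider below also sleeps 1s per item)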

2. Set up items.py

import scrapy
 
 
class DoubanItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    book_name = scrapy.Field()      # book title
    author = scrapy.Field()         # author (first segment of the "pub" line)
    grade = scrapy.Field()          # rating score
    count = scrapy.Field()          # number of ratings
    introduction = scrapy.Field()   # one-line description

3. Create a new doubanspider.py under the spiders directory with the following code:

import scrapy
from douban.items import DoubanItem
import time

class DoubanspiderSpider(scrapy.Spider):
    name = 'doubanspider'
    # restrict crawling to this domain
    allowed_domains = ['douban.com']

    def start_requests(self):
        # start URL: the "Chinese literature" tag page
        url = 'https://book.douban.com/tag/%E4%B8%AD%E5%9B%BD%E6%96%87%E5%AD%A6'
        yield scrapy.Request(url, callback=self.parse, dont_filter=True)

    def parse(self, response):
        info_list = response.xpath('//div[@class="info"]')
        for info in info_list:
            # sleep 1 second between items to go easy on the site
            time.sleep(1)
            # skip entries that are missing any of the fields we need
            if (info.xpath('./h2/a/text()').extract_first() is None
                    or info.xpath('./div[@class="pub"]/text()').extract_first() is None
                    or info.xpath('./div[2]/span[3]/text()').extract_first() is None
                    or info.xpath('./p/text()').extract_first() is None):
                continue
            item = DoubanItem()  # build a fresh item for each book
            item['book_name'] = info.xpath('./h2/a/text()').extract_first().strip()  # strip surrounding whitespace
            # the "pub" line is "author / publisher / date / price"; keep the author part
            item['author'] = info.xpath('./div[@class="pub"]/text()').extract_first().strip().split('/')[0]
            item['grade'] = info.xpath('./div[2]/span[2]/text()').extract_first()
            item['count'] = info.xpath('./div[2]/span[3]/text()').extract_first().strip().replace('\n', '')
            item['introduction'] = info.xpath('./p/text()').extract_first().replace('\n', '')
            yield item
        # follow the link to the next page, if any
        next_temp_url = response.xpath("//div[@id='subject_list']/div[@class='paginator']/span[@class='next']/a/@href").extract_first()
        if next_temp_url:
            next_url = response.urljoin(next_temp_url)
            yield scrapy.Request(next_url)

  

4. Try saving to a local .csv file

In the directory containing scrapy.cfg, run the following command from cmd:

scrapy crawl doubanspider -o doubanread.csv
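If the exported CSV shows garbled Chinese when opened in Excel, one common remedy (not from the original post) is to set the feed export encoding in settings.py so the file carries a UTF-8 BOM:

FEED_EXPORT_ENCODING = 'utf-8-sig'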

 

 

5. Create the MySQL database douban and the table doubanread


drop database if exists douban;
create database if not exists douban charset=gbk;
use douban;
create table doubanread(
id int(11) primary key not null auto_increment,
book_name varchar(255) default null,
author varchar(255) default null,
grade varchar(255) default null,
count varchar(255) default null,
introduction varchar(255) default null
) engine=innoDB auto_increment=1409 default charset=utf8;
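Note that although the database default charset is gbk, the table itself is created with utf8, and the table-level charset is what the columns actually use, so it matches the charset='utf8' connection used by the pipeline below. A quick standalone check (same credentials as the pipeline; replace the password with your own) can confirm the connection works before wiring it into Scrapy:

import pymysql

conn = pymysql.connect(host='localhost', port=3306, user='root',
                       password='yourpwd', db='douban', charset='utf8')
cursor = conn.cursor()
cursor.execute('show tables')   # should list doubanread
print(cursor.fetchall())
conn.close()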


 

 

6. Set up pipelines.py (I hit a pitfall here: do NOT use MySQLdb!)

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
 
 
# useful for handling different item types with a single interface
#from itemadapter import ItemAdapter
# import MySQLdb
 
import pymysql
# # load the settings file
# from scrapy.utils.project import get_project_settings
 
 
class DoubanPipeline(object):
    # The abandoned MySQLdb attempt is kept below for reference -- it ran into
    # encoding trouble, which is the pitfall mentioned in the heading.
    # def __init__(self):
    #     self.db = MySQLdb.connect('localhost', 'root', 'yourpwd', 'douban', charset='gbk')
    #     self.cursor = self.db.cursor()
    #
    # def process_item(self, item, spider):
    #     book_name = item.get('book_name', 'N/A')
    #     author = item.get('author', 'N/A')
    #     grade = item.get('grade', 'N/A')
    #     count = item.get('count', 'N/A')
    #     introduction = item.get('introduction', 'N/A')
    #     sql = "insert into doubanread(book_name,author,grade,count,introduction) values ('%s','%s','%s','%s','%s')"
    #     self.cursor.execute(sql, (book_name, author, grade, count, MySQLdb.escape_string(introduction).decode('gbk')))
    #     print("======================================== insert succeeded ========================================")
    #     self.db.commit()
    #
    # def close_spider(self, spider):
    #     self.cursor.close()
    #     self.db.close()

    def __init__(self):
        self.host = 'localhost'
        self.port = 3306
        self.user = 'root'
        self.password = 'yourpwd'   # replace with your own MySQL password
        self.db = 'douban'
        self.charset = 'utf8'
        self.connect()

    def connect(self):
        self.conn = pymysql.connect(
            host=self.host, port=self.port, user=self.user,
            password=self.password, db=self.db, charset=self.charset)
        # create a cursor
        self.cursor = self.conn.cursor()

    def process_item(self, item, spider):
        # use a parameterized query so quotes in titles or introductions cannot break the SQL
        sql = "insert into doubanread(book_name,author,grade,count,introduction) values (%s,%s,%s,%s,%s)"
        self.cursor.execute(sql, (item['book_name'], item['author'], item['grade'],
                                  item['count'], item['introduction']))
        # commit the transaction
        self.conn.commit()
        print("================================= row committed =======================================")
        return item

    def close_spider(self, spider):
        # close the cursor and connection when the spider finishes
        self.cursor.close()
        self.conn.close()

  One detail here: remember to enable the pipeline in settings.py:

ITEM_PIPELINES = {
   'douban.pipelines.DoubanPipeline': 300,
}
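With the pipeline enabled, re-running scrapy crawl doubanspider writes each item into doubanread. A quick sanity check from the MySQL client:

use douban;
select count(*) from doubanread;
select book_name, grade, `count` from doubanread limit 5;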

  

 7. The resulting data:

 
