Crawling Douban Books with Scrapy and Storing the Results in a Database
I. Objective
By designing and implementing a Scrapy crawling project, learn the Scrapy framework and basic MySQL operations: crawl book data from web pages with Scrapy and save it to a database.
II. Analyze the page structure
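Each book entry on the tag listing page sits in a div with class "info": the title is the link text under h2/a, the "author / publisher / date / price" line is in the div with class "pub", the rating and ratings count are spans inside the entry's second child div, and the one-line introduction is in a p tag. This is exactly the structure the spider's XPath expressions below rely on, and the selectors can be verified interactively with scrapy shell before writing any code:

scrapy shell "https://book.douban.com/tag/%E4%B8%AD%E5%9B%BD%E6%96%87%E5%AD%A6"
>>> response.xpath('//div[@class="info"]/h2/a/text()').extract_first()
>>> response.xpath('//div[@class="info"]/div[@class="pub"]/text()').extract_first()

If these come back empty, Douban is most likely rejecting the default Scrapy User-Agent; see the settings change in step IV.1 below.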
III. Create a Scrapy project named douban
scrapy startproject douban
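This generates the standard Scrapy project skeleton; all the files edited in the following steps live inside it:

douban/
    scrapy.cfg
    douban/
        __init__.py
        items.py
        middlewares.py
        pipelines.py
        settings.py
        spiders/
            __init__.py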
IV. Write or modify the code
1. Modify settings.py
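The post does not list the exact edits; a minimal sketch of the settings a crawl like this usually changes (not necessarily the author's exact values) is:

# settings.py (excerpt)

# identify as a normal browser; Douban tends to serve empty pages
# to the default Scrapy User-Agent
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'

# tutorial crawls of this kind typically do not honor robots.txt
ROBOTSTXT_OBEY = False

# throttle requests at the framework level
# (the spider below also sleeps one second per entry)
DOWNLOAD_DELAY = 1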
2. Set up items.py
import scrapy


class DoubanItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    book_name = scrapy.Field()
    author = scrapy.Field()
    grade = scrapy.Field()
    count = scrapy.Field()
    introduction = scrapy.Field()
3. Create a new doubanspider.py under the spiders directory with the following code
import time

import scrapy

from douban.items import DoubanItem


class DoubanspiderSpider(scrapy.Spider):
    name = 'doubanspider'
    # restrict the crawl to this domain
    allowed_domains = ['douban.com']

    def start_requests(self):
        # starting URL: the "Chinese literature" (中国文学) tag page
        url = 'https://book.douban.com/tag/%E4%B8%AD%E5%9B%BD%E6%96%87%E5%AD%A6'
        yield scrapy.Request(url, callback=self.parse, dont_filter=True)

    def parse(self, response):
        info_list = response.xpath('//div[@class="info"]')
        for info in info_list:
            # sleep one second per entry to go easy on the server
            time.sleep(1)
            # skip entries that are missing any required field
            if (info.xpath('./h2/a/text()').extract_first() is None
                    or info.xpath('./div[@class="pub"]/text()').extract_first() is None
                    or info.xpath('./div[2]/span[3]/text()').extract_first() is None
                    or info.xpath('./p/text()').extract_first() is None):
                continue
            item = DoubanItem()
            # strip surrounding whitespace from the title
            item['book_name'] = info.xpath('./h2/a/text()').extract_first().strip()
            # the pub line reads "author / publisher / date / price"; keep the author
            item['author'] = info.xpath('./div[@class="pub"]/text()').extract_first().strip().split('/')[0]
            item['grade'] = info.xpath('./div[2]/span[2]/text()').extract_first()
            item['count'] = info.xpath('./div[2]/span[3]/text()').extract_first().strip().replace('\n', '')
            item['introduction'] = info.xpath('./p/text()').extract_first().replace('\n', '')
            yield item

        # get the next page's url and follow it
        next_temp_url = response.xpath(
            "//div[@id='subject_list']/div[@class='paginator']"
            "/span[@class='next']/a/@href").extract_first()
        if next_temp_url:
            next_url = response.urljoin(next_temp_url)
            yield scrapy.Request(next_url)
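Two details worth noting: the next-page href on the page is relative, so response.urljoin resolves it against the current page's URL; and because the final scrapy.Request is yielded without an explicit callback, Scrapy dispatches the response to self.parse by default, which is what keeps the pagination loop going.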
4. Try saving to a local .csv file
In the same directory as scrapy.cfg, run the following command from cmd:
scrapy crawl doubanspider -o doubanread.csv
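Note that -o appends to an existing file, so delete doubanread.csv between runs; on Scrapy 2.0 or later, -O (capital O) overwrites the file instead.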
5. Create the MySQL database douban and the table doubanread
drop database if exists douban;
-- use utf8 throughout so it matches the charset of the pipeline's connection
create database if not exists douban charset=utf8;
use douban;
create table doubanread(
    id int(11) primary key not null auto_increment,
    book_name varchar(255) default null,
    author varchar(255) default null,
    grade varchar(255) default null,
    count varchar(255) default null,
    introduction varchar(255) default null
) engine=InnoDB default charset=utf8;
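grade and count are declared varchar rather than numeric types because the spider stores them as raw page text (count, for example, is the literal "(NNNN人评价)" string from the page), and thanks to auto_increment the insert statement in the pipeline never has to mention id.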
6. Set up pipelines.py (I hit a pitfall here: whatever you do, don't use MySQLdb!)
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html

# useful for handling different item types with a single interface
# from itemadapter import ItemAdapter

import pymysql

# could be used to load the DB config from settings.py instead of hard-coding it
# from scrapy.utils.project import get_project_settings

# An earlier version of this pipeline used MySQLdb and kept failing on
# character encoding when inserting the introductions; it was dropped
# in favor of pymysql.


class DoubanPipeline(object):

    def __init__(self):
        self.host = 'localhost'
        self.port = 3306
        self.user = 'root'
        self.password = 'yourpwd'
        self.db = 'douban'
        self.charset = 'utf8'
        self.connect()

    def connect(self):
        self.conn = pymysql.connect(
            host=self.host, port=self.port, user=self.user,
            password=self.password, db=self.db, charset=self.charset)
        # create a cursor
        self.cursor = self.conn.cursor()

    def process_item(self, item, spider):
        # use a parameterized query so that quotes inside the introduction
        # cannot break the SQL statement
        sql = ("insert into doubanread(book_name,author,grade,count,introduction) "
               "values (%s,%s,%s,%s,%s)")
        self.cursor.execute(sql, (item['book_name'], item['author'],
                                  item['grade'], item['count'],
                                  item['introduction']))
        # commit the transaction
        self.conn.commit()
        print("================================= insert committed =================================")
        return item

    def close_spider(self, spider):
        self.cursor.close()
        self.conn.close()
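As the commented-out import of get_project_settings above suggests, a cleaner variant reads the connection parameters from settings.py instead of hard-coding them. A minimal sketch, assuming MYSQL_* keys that are not part of the original project are added to settings.py:

# the MYSQL_* setting names below are an assumption, not part of the original post
from scrapy.utils.project import get_project_settings

settings = get_project_settings()
host = settings.get('MYSQL_HOST', 'localhost')
port = settings.getint('MYSQL_PORT', 3306)
user = settings.get('MYSQL_USER', 'root')
password = settings.get('MYSQL_PASSWORD', '')
db = settings.get('MYSQL_DB', 'douban')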
One detail here: remember to enable the pipeline in settings.py:
ITEM_PIPELINES = {
    'douban.pipelines.DoubanPipeline': 300,
}
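The value 300 is the pipeline's order: Scrapy runs enabled pipelines in ascending order of this number (conventionally in the 0-1000 range), so it only matters once more than one pipeline is enabled.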
7. The resulting data:
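A quick way to spot-check the stored rows, using the same connection parameters as the pipeline:

import pymysql

# connect exactly as the pipeline does
conn = pymysql.connect(host='localhost', port=3306, user='root',
                       password='yourpwd', db='douban', charset='utf8')
with conn.cursor() as cursor:
    # how many books were stored?
    cursor.execute('select count(*) from doubanread')
    print('rows:', cursor.fetchone()[0])
    # show a few sample rows
    cursor.execute('select book_name, author, grade, `count` from doubanread limit 5')
    for row in cursor.fetchall():
        print(row)
conn.close()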