豆瓣图书信息爬虫
一、定义数据库数据模型
-- Schema for the scraped Douban book details.
-- Fixes vs. the draft: COMMENT strings must be quoted in MySQL; `unsigned`
-- is invalid on varchar; the pages column had been garbled into `translator`;
-- `pricel` -> `price` (typo); DECIMAL (exact) instead of DOUBLE for money.
CREATE DATABASE IF NOT EXISTS doubandb DEFAULT CHARACTER SET utf8mb4;

USE doubandb;

CREATE TABLE IF NOT EXISTS books (
    id         bigint(20) unsigned NOT NULL COMMENT 'ID号',
    title      varchar(255) DEFAULT NULL COMMENT '书名',
    author     varchar(64)  DEFAULT NULL COMMENT '作者',
    press      varchar(255) DEFAULT NULL COMMENT '出版社',
    original   varchar(255) DEFAULT NULL COMMENT '原作者',
    translator varchar(128) DEFAULT NULL COMMENT '译者',
    pages      int(10) unsigned DEFAULT NULL COMMENT '页数',
    imprint    varchar(128) DEFAULT NULL COMMENT '出版年',
    price      decimal(6,2) unsigned DEFAULT NULL COMMENT '定价',
    binding    varchar(32)  DEFAULT NULL COMMENT '装帧',
    series     varchar(128) DEFAULT NULL COMMENT '丛书',
    isbn       varchar(255) DEFAULT NULL COMMENT 'ISBN',
    score      varchar(255) DEFAULT NULL COMMENT '评分',
    number     int(10) unsigned DEFAULT NULL COMMENT '评论人数',
    PRIMARY KEY (id)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
二、爬取豆瓣图书所有标签信息
def main():
    """Collect every Douban book tag URL and queue it in Redis.

    Fetches the tag index page, extracts each tag link's href, and pushes
    the hrefs onto the Redis list ``book:tag_urls`` for the master spider
    to consume.
    """
    url = 'https://book.douban.com/tag/?view=type&icn=index-sorttags-all'
    # A browser User-Agent is required: Douban rejects default client UAs.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.85 Safari/537.36'
    }
    response = requests.get(url, headers=headers)
    html = response.content.decode('utf-8')
    doc = pq(html)
    # Every tag link sits in a <td><a> cell of the tag tables.
    items = doc('table.tagCol tr td a')
    link = redis.StrictRedis(host='localhost', port=6379, db=0)
    for a in items.items():
        tag = a.attr.href  # absolute path, e.g. '/tag/小说'
        link.lpush('book:tag_urls', tag)
三、master主机scrapy爬取图书url信息
1、items.py
class MasterItem(scrapy.Item):
    """Item carrying one book detail-page URL harvested by the master spider."""

    # Detail-page URL extracted from a tag listing page; the pipeline
    # pushes it into Redis for the slave crawlers to fetch.
    url = scrapy.Field()
2. 修改 settings.py 配置信息
# Douban's robots.txt would otherwise block the spider.
# Bug fix: Python's boolean literal is `False`, not `false`.
ROBOTSTXT_OBEY = False

# Keep concurrency at 1 to avoid triggering Douban's anti-scraping checks.
CONCURRENT_REQUESTS = 1

# Use scrapy_redis' duplicate filter instead of Scrapy's built-in one,
# so dedup state is shared by every master/slave crawler process.
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"

# Use the scrapy_redis scheduler so the request queue lives in Redis.
SCHEDULER = "scrapy_redis.scheduler.Scheduler"

# SpiderPriorityQueue is scrapy_redis' default queue mode (priority-aware).
SCHEDULER_QUEUE_CLASS = "scrapy_redis.queue.SpiderPriorityQueue"
3.修改book.py
class BookSpider(scrapy.Spider):
    """Master spider: walks Douban tag listing pages and yields book URLs.

    Tag URLs are consumed from the Redis list ``book:tag_urls`` (populated
    by the tag-collection script). Each extracted detail-page URL is emitted
    as a MasterItem for the pipeline to queue for the slave crawlers.
    """

    name = 'book'
    allowed_domains = ['book.douban.com']
    # No trailing slash: stored tag hrefs already start with '/tag/...',
    # so the original 'http://book.douban.com/' produced a double slash.
    base_url = 'http://book.douban.com'

    def start_requests(self):
        # Bug fix: redis-py's `db` argument is an int database index,
        # not the string 'db0'.
        r = redis.Redis(
            host=self.settings.get('REDIS_HOST'),
            port=self.settings.get('REDIS_PORT'),
            db=0,
        )
        while r.llen('book:tag_urls'):
            tag = r.lpop('book:tag_urls')
            if isinstance(tag, bytes):
                # redis-py returns bytes unless decode_responses=True.
                tag = tag.decode('utf-8')
            # quote() percent-encodes the non-ASCII tag name; '/' stays safe.
            url = self.base_url + quote(tag)
            yield Request(url=url, callback=self.parse, dont_filter=True)

    def parse(self, response):
        """Yield every book URL on this listing page, then follow pagination."""
        print(response.url)
        # Bug fix: response.css() requires a selector. Selector assumed from
        # Douban's subject-list markup — TODO confirm against the live page.
        links = response.css('li.subject-item div.info h2 a::attr(href)').extract()
        for link in links:
            item = MasterItem()
            item['url'] = link
            yield item
        # Follow the "后页" (next page) link until the last page.
        next_url = response.css('span.next a::attr(href)').extract_first()
        if next_url:
            yield Request(url=response.urljoin(next_url),
                          callback=self.parse, dont_filter=True)
四、slave从机scrapy爬取图书详情页