Scraping Dangdang book information
Multiple machines crawl at the same time, sharing a single Redis instance for bookkeeping via scrapy_redis.
Requests waiting to be crawled are stored in Redis; each machine pops a request object (deleting it from the queue) and then crawls it. This gives us a distributed spider.
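For this to work, every machine's Scrapy project must route scheduling and deduplication through the shared Redis instance. Below is a minimal settings sketch; the Redis URL, persistence flag, and pipeline priority are assumptions to adjust for your own deployment:

```python
# settings.py -- minimal scrapy_redis wiring (a sketch, not the full project config)

# Schedule requests through Redis so all machines share one request queue.
SCHEDULER = "scrapy_redis.scheduler.Scheduler"

# Store request fingerprints in Redis so duplicates are filtered cluster-wide.
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"

# Keep the queue and fingerprint set in Redis after the spider closes,
# so the crawl can be paused/resumed or joined by late-starting workers.
SCHEDULER_PERSIST = True

# Location of the shared Redis instance (assumed local here).
REDIS_URL = "redis://127.0.0.1:6379"

# Optional: also push scraped items into Redis.
ITEM_PIPELINES = {
    "scrapy_redis.pipelines.RedisPipeline": 300,
}
```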
```python
import scrapy
from scrapy_redis.spiders import RedisSpider
from copy import deepcopy


class DangdangSpider(RedisSpider):
    name = 'dangdang'
    allowed_domains = ['dangdang.com']
    # On start, the spider reads its start URLs from this Redis key.
    redis_key = "dangdang"  # lpush dangdang 'http://book.dangdang.com/'

    def parse(self, response):
        # Top-level categories (skip the last four divs, which are not category blocks)
        div_list = response.xpath("//div[@class='con flq_body']/div")[:-4]
        for div in div_list:
            item = {}
            item['b_cate'] = div.xpath("./dl/dt//text()").extract()
            item['b_cate'] = [i.strip() for i in item['b_cate'] if len(i.strip()) > 0]  # drop empty strings
            # Mid-level categories
            if item['b_cate'] == ['创意文具']:  # "creative stationery" pages use a different layout
                item['m_cate'] = None
                item['s_cate_url'] = div.xpath("./dl/dt/a/@ddt-src").extract_first()
                # yield scrapy.Request(
                #     item['s_cate_url'],
                #     callback=self.parse_special,
                #     meta={'item': deepcopy(item)}
                # )
            else:
                dl_list = div.xpath(".//dl[@class='inner_dl']")
                for dl in dl_list:
                    item['m_cate'] = dl.xpath("./dt//text()").extract()
                    item['m_cate'] = [i.strip() for i in item['m_cate'] if len(i.strip()) > 0]
                    # Sub-categories
                    dd_list = dl.xpath("./dd")
                    for dd in dd_list:
                        item['s_cate'] = dd.xpath("./a/@title").extract_first()
                        item['s_cate_url'] = dd.xpath("./a/@ddt-src").extract_first()
                        # All books under this sub-category
                        if item['s_cate_url'] is not None:
                            yield scrapy.Request(
                                item['s_cate_url'],
                                callback=self.parse_books,
                                meta={'item': deepcopy(item)}
                            )

    def parse_special(self, response):
        """Stationery pages (different layout); not implemented yet."""
        pass

    def parse_books(self, response):
        item = response.meta['item']
        # Books in the current sub-category (note the trailing space in the class name)
        li_list = response.xpath("//ul[@class='list_aa ']/li")
        for li in li_list:
            try:
                # The price is split into an integer part and a decimal "tail".
                item['book_price'] = li.xpath(".//span[@class='num']/text()").extract_first() + \
                                     li.xpath(".//span[@class='tail']/text()").extract_first()
            except TypeError:  # one of the two parts is missing (extract_first() returned None)
                item['book_price'] = 'Unknown'
            item['book_url'] = li.xpath("./a/@href").extract_first()
            if item['book_url'] is not None:
                yield scrapy.Request(
                    item['book_url'],
                    callback=self.parse_book_detail,
                    meta={'item': deepcopy(item)}
                )

    def parse_book_detail(self, response):
        item = response.meta['item']
        # The <img> inside <h1> has no text node; the book name is in the h1's title attribute.
        item['book_name'] = response.xpath("//div[@class='name_info']/h1/@title").extract_first()
        item['book_desc'] = response.xpath("//span[@class='head_title_name']/text()").extract_first()
        # Detailed info about this book
        span_list = response.xpath("//div[@class='messbox_info']/span")
        item['book_author'] = span_list.xpath("./span[1]/a/text()").extract()  # possibly several authors
        item['publisher'] = span_list.xpath("./span[2]/a/text()").extract_first()
        item['pub_date'] = span_list.xpath("./span[3]/text()").extract_first()
        print(item)
        # yield item
```
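To launch a crawl, start `scrapy crawl dangdang` on each machine; the workers then block on the `dangdang` key until a start URL is pushed. Here is a sketch of the seeding step with the redis-py client (connection details are assumptions; `redis-cli lpush dangdang 'http://book.dangdang.com/'` does the same thing):

```python
import redis

# Connect to the shared Redis instance (assumed local here).
r = redis.Redis(host="127.0.0.1", port=6379)

# Push the start URL onto the key named by the spider's redis_key;
# the first idle worker pops it and begins crawling.
r.lpush("dangdang", "http://book.dangdang.com/")
```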