Douban book crawler
settings.py
BOT_NAME = 'doubanbook'

SPIDER_MODULES = ['doubanbook.spiders']
NEWSPIDER_MODULE = 'doubanbook.spiders'

USER_AGENT = 'Mozilla/5.0 (Windows NT 6.3; WOW64; rv:45.0) Gecko/20100101 Firefox/45.0'

FEED_URI = u'file:///E://douban3.csv'
FEED_FORMAT = 'CSV'
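Note that in newer Scrapy releases (2.1 and later) the single FEED_URI/FEED_FORMAT pair is deprecated in favor of the FEEDS dictionary. A rough equivalent is sketched below; the output path is only an example, adjust it to your own disk layout:

# Sketch of the newer FEEDS setting (Scrapy 2.1+); path is just an example
FEEDS = {
    'file:///E:/douban3.csv': {
        'format': 'csv',
        'encoding': 'utf8',
    },
}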
main.py
# -*- coding: UTF-8 -*-
from scrapy import cmdline

# Equivalent to running "scrapy crawl dbbook" from the project directory
cmdline.execute("scrapy crawl dbbook".split())
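main.py only shells out to the scrapy crawl command, so you can just as well run scrapy crawl dbbook from a terminal. If you prefer launching the spider in-process (convenient for debugging in an IDE), a sketch using Scrapy's CrawlerProcess is shown below; it assumes the project layout used in this post, with the spider defined in doubanbook/spiders/dbbook.py:

# -*- coding: utf-8 -*-
# Alternative launcher (sketch): run the spider in-process instead of via cmdline
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

from doubanbook.spiders.dbbook import DbbookSpider

if __name__ == '__main__':
    # get_project_settings() loads settings.py, including the CSV feed export
    process = CrawlerProcess(get_project_settings())
    process.crawl(DbbookSpider)
    process.start()  # blocks until the crawl finishes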
items.py
import scrapy


class DoubanbookItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    title = scrapy.Field()
    rate = scrapy.Field()
    author = scrapy.Field()
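A scrapy.Item behaves much like a dict, which makes quick sanity checks easy; the values in the sketch below are made-up placeholders, not real scraped data:

# Quick check that the item fields work as expected (placeholder values)
item = DoubanbookItem(title='some title', rate='9.0', author='some author')
print(item['title'])   # some title
print(dict(item))      # {'title': 'some title', 'rate': '9.0', 'author': 'some author'}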
dbbook.py
# -*- coding: utf-8 -*-
import re

import scrapy

from doubanbook.items import DoubanbookItem


class DbbookSpider(scrapy.Spider):
    name = "dbbook"
    # allowed_domains = ["douban.com"]
    start_urls = (
        'https://www.douban.com/doulist/1264675/',
    )

    def parse(self, response):
        selector = scrapy.Selector(response)
        books = selector.xpath('//div[@class="bd doulist-subject"]')
        for each in books:
            # Build a fresh item for every book entry
            item = DoubanbookItem()
            title = each.xpath('div[@class="title"]/a/text()').extract()[0]
            rate = each.xpath('div[@class="rating"]/span[@class="rating_nums"]/text()').extract()[0]
            # The author line sits in the "abstract" block, before the first <br>
            author = re.search(r'<div class="abstract">(.*?)<br', each.extract(), re.S).group(1)
            title = title.replace(' ', '').replace('\n', '')
            author = author.replace(' ', '').replace('\n', '')
            item['title'] = title.encode('utf-8')
            item['rate'] = rate
            item['author'] = author.encode('utf-8')
            yield item
        # Follow the "next page" link, if there is one
        nextp = selector.xpath('//span[@class="next"]/link/@href').extract()
        if nextp:
            next_url = nextp[0]
            print(next_url)
            yield scrapy.http.Request(next_url, callback=self.parse)
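Before running the full crawl, it can save time to verify the XPath expressions interactively with scrapy shell; the snippet below is only an illustration of that workflow, using the same start URL as the spider:

scrapy shell 'https://www.douban.com/doulist/1264675/'
# inside the shell, check that the selectors actually match something:
response.xpath('//div[@class="bd doulist-subject"]')
response.xpath('//div[@class="bd doulist-subject"]/div[@class="title"]/a/text()').extract()
response.xpath('//span[@class="next"]/link/@href').extract()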