Scrapy爬取大众点评

Posted on 2018-04-17 23:20 风吹白杨的安妮阅读(2082) 评论(0) 编辑收藏举报

最近想吃烤肉，所以想看看深圳哪里的烤肉比较好吃，于是自己就开始爬虫咯。这是个静态网页，有反爬机制，我在setting和middlewares设置了反爬措施

Setting

# -*- coding: utf-8 -*-

# Scrapy settings for dazhong project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://doc.scrapy.org/en/latest/topics/settings.html
#     https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://doc.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'dazhong'

SPIDER_MODULES = ['dazhong.spiders']
NEWSPIDER_MODULE = 'dazhong.spiders'


# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 UBrowser/6.2.3964.2 Safari/537.36'

# Obey robots.txt rules
ROBOTSTXT_OBEY = True

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 10
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'dazhong.middlewares.DazhongSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
DOWNLOADER_MIDDLEWARES = {
    'scrapy.downloadermiddleware.useragent.UserAgentMiddleware': None, 
    'dazhong.middlewares.MyUserAgentMiddleware': 400,
}

# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
'dazhong.pipelines.DazhongPipeline': 200,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

MY_USER_AGENT = ['Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 UBrowser/6.2.3964.2 Safari/537.36']

ITEM

import scrapy

class DazhongItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    name = scrapy.Field()
    location = scrapy.Field()
    people = scrapy.Field()
    money = scrapy.Field()
    taste = scrapy.Field()
    envir = scrapy.Field()
    taste_score = scrapy.Field()
    service = scrapy.Field()

Spider：

# -*- coding: utf-8 -*-
import scrapy
import re
from bs4 import BeautifulSoup
from scrapy.http import Request
from dazhong.items import DazhongItem

class DzSpider(scrapy.Spider):
    name = 'dz'
    allowed_domains = ['www.dianping.com']
    #headers = {'USER-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 UBrowser/6.2.3964.2 Safari/537.36'}
    #custom_settings = {'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 UBrowser/6.2.3964.2 Safari/537.36'}
    first_url = 'http://www.dianping.com/shenzhen/ch10/g114'
    last_url = 'p'
    def start_requests(self):
        for i in range(1,45):
            url = self.first_url + self.last_url + str(i)
            yield Request(url,self.parse)    
    def parse(self, response):
        soup = BeautifulSoup(response.body.decode('UTF-8'),'lxml')
        for site in soup.find_all('div',class_='txt'):
            item = DazhongItem()
            try:
                item['name'] = site.find('div',class_='tit').find({'h4'}).get_text()
                item['location'] = site.find('div',class_='tag-addr').find('span',class_='addr').get_text()
                item['people'] = site.find('div',class_='comment').find('a').find('b').get_text()
                item['money'] = site.find('div',class_='comment').find_all('a')[1].find('b').get_text()
                item['taste'] = site.find('div',class_= 'tag-addr').find('a').find('span').get_text() 
                item['envir'] = site.find('span',class_= 'comment-list').find_all('span')[1].find('b').get_text()
                item['taste_score'] = site.find('span',class_= 'comment-list').find_all('span')[0].find('b').get_text()
                item['service'] = site.find('span',class_= 'comment-list').find_all('span')[2].find('b').get_text()
                yield item
            except:
                pass

PIPELINE：

from openpyxl import Workbook

class DazhongPipeline(object):  # 设置工序一
    def __init__(self):
        self.wb = Workbook()
        self.ws = self.wb.active
        self.ws.append(['店铺名称','地点','评论人数','平均消费','口味','环境评分','口味评分','服务评分',])  # 设置表头
    def process_item(self, item, spider):  # 工序具体内容
        line = [item['name'],item['location'],item['people'],item['money'],item['taste'],item['envir'],item['taste_score'],item['service']]  # 把数据中每一项整理出来
        self.ws.append(line)  # 将数据以行的形式添加到xlsx中
        self.wb.save('dazhong.xlsx')  # 保存xlsx文件
        return item
    def spider_closed(self, spider):
        self.file.close()

middlewares：

import scrapy
from scrapy import signals
from scrapy.downloadermiddlewares.useragent import UserAgentMiddleware
import random

class MyUserAgentMiddleware(UserAgentMiddleware):
    def __init__(self, user_agent):
        self.user_agent = user_agent
    @classmethod
    def from_crawler(cls,crawler):
        return cls(
                user_agent = crawler.settings.get('MY_USER_AGENT')
            )
    def process_request(self, request, spider):
        agent = random.choice(self.user_agent)
        request.headers['User-Agent'] = agent

那些没有环境评分、服务评分数据的也就跳过了，爬来没意义

结果如下：

决定去吃姜虎东

刷新页面返回顶部

风吹白杨的安妮

公告

Scrapy爬取大众点评