scrapy
1. Create a project
scrapy startproject <projectName>
2. Create a spider (inside the project directory: cd <projectName>); the generated skeleton is sketched after this list
scrapy genspider <spiderName> <start_url>
3. Run the spider (inside the project directory)
scrapy crawl <spiderName>
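For reference, scrapy genspider produces a spider skeleton along the lines of the sketch below; the spider name, domain, and parse logic are placeholders, assuming the default template.
// project_dir/spiders/example.py (illustrative sketch)
import scrapy

class ExampleSpider(scrapy.Spider):
    name = 'example'                      # the name used by `scrapy crawl example`
    allowed_domains = ['example.com']
    start_urls = ['http://example.com/']

    def parse(self, response):
        # extract something simple so the crawl produces output
        yield {'title': response.css('title::text').extract_first()}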
======
Crawling tips
Configure settings.py
1. Disable robots.txt compliance (ROBOTSTXT_OBEY)
ROBOTSTXT_OBEY = False
2. Set a download delay
DOWNLOAD_DELAY = 3
3. Set USER_AGENT and DEFAULT_REQUEST_HEADERS
# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0;WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'

# Override the default request headers:
DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0;WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
}
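Project-wide defaults like these can also be overridden for a single request by passing headers= when the request is yielded; a minimal sketch, with a placeholder URL and User-Agent:
import scrapy

class HeaderDemoSpider(scrapy.Spider):
    name = 'header_demo'

    def start_requests(self):
        # headers passed here take precedence over DEFAULT_REQUEST_HEADERS for this request only
        yield scrapy.Request('http://example.com/',
                             headers={'User-Agent': 'my-custom-agent'},
                             callback=self.parse)

    def parse(self, response):
        self.logger.info('got %s', response.url)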
4. Configure middlewares (set the crawler's headers and proxy)
4.1 Enable the downloader middleware
// project_dir/settings.py
# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
DOWNLOADER_MIDDLEWARES = {
    'ADemo.middlewares.ProxyMilldeware': 301,  # downloader middleware [hooks between the engine and the downloader]
}
// project_dir/middlewares.py
class ProxyMilldeware(object):
    # the request comes from the scheduler [used to adjust the request object before it reaches the downloader]
    def process_request(self, request, spider):
        # print('*' * 100)
        request.meta['proxy'] = 'http://127.0.0.1:1080'
        request.headers.setdefault('User-Agent', 'put your browser User-Agent here')
        request.cookies = {'cookie_name': 'cookie_value'}  # set your cookies here, as a dict

    # the response comes from the downloader [used to process the response object returned by the downloader]
    def process_response(self, request, response, spider):
        print(response.status)
        print(response.text)
        return response
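If a proxy is only needed for some requests, the same effect can be had without a custom middleware, because Scrapy's built-in HttpProxyMiddleware honours a 'proxy' key in request.meta; a minimal sketch with placeholder addresses:
import scrapy

class ProxyDemoSpider(scrapy.Spider):
    name = 'proxy_demo'

    def start_requests(self):
        # the 'proxy' meta key is read by the built-in HttpProxyMiddleware
        yield scrapy.Request('http://example.com/',
                             meta={'proxy': 'http://127.0.0.1:1080'},
                             callback=self.parse)

    def parse(self, response):
        self.logger.info('status: %s', response.status)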
4.2 Enable the spider middleware
// project_dir/settings.py
# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
SPIDER_MIDDLEWARES = {
    'ADemo.middlewares.SpiderMiddleware': 543,  # spider middleware [hooks between the spider and the engine]
}
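The SpiderMiddleware class referenced above is not listed in this note; a minimal sketch of what it might look like in project_dir/middlewares.py, with pass-through logic only for illustration:
class SpiderMiddleware(object):
    # called for each response before it is handed to the spider
    def process_spider_input(self, response, spider):
        return None                # returning None lets processing continue

    # called with the items/requests the spider yields for a response
    def process_spider_output(self, response, result, spider):
        for item_or_request in result:
            yield item_or_request  # pass everything through unchanged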
5. Configure redirects and the retry count
// project_dir/settings.py
# Retry settings
RETRY_ENABLED = False
# RETRY_TIMES = 1
# RETRY_HTTP_CODES = [400, 500]

# Redirect settings
REDIRECT_ENABLED = False
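Both behaviours can also be switched off per request instead of project-wide, via the dont_retry and dont_redirect meta keys understood by the built-in retry and redirect middlewares; a short sketch:
import scrapy

# disable retries and redirects for this single request only
request = scrapy.Request('http://example.com/',
                         meta={'dont_retry': True, 'dont_redirect': True})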
6. Logs (crawl stats)
{'downloader/request_bytes': 288,               # bytes sent in requests
 'downloader/request_count': 1,                 # number of requests
 'downloader/request_method_count/GET': 1,      # number of GET requests
 'downloader/response_bytes': 71860,            # bytes received in responses
 'downloader/response_count': 1,                # number of responses
 'downloader/response_status_count/200': 1,     # number of responses with status code 200
 'finish_reason': 'finished',                   # why the crawl ended: completed normally
 'finish_time': datetime.datetime(2018, 1, 10, 6, 35, 53, 633178),   # finish time
 'log_count/DEBUG': 2,
 'log_count/INFO': 7,
 'response_received_count': 1,
 'scheduler/dequeued': 1,
 'scheduler/dequeued/memory': 1,
 'scheduler/enqueued': 1,
 'scheduler/enqueued/memory': 1,
 'start_time': datetime.datetime(2018, 1, 10, 6, 35, 51, 585036)}    # start time
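The same counters are exposed at runtime through the stats collector, so a spider can add its own or dump them when it closes; a sketch, where the 'pages_seen' key is made up:
import scrapy

class StatsDemoSpider(scrapy.Spider):
    name = 'stats_demo'
    start_urls = ['http://example.com/']

    def parse(self, response):
        # bump a custom counter; it shows up in the final stats dump
        self.crawler.stats.inc_value('pages_seen')

    def closed(self, reason):
        self.logger.info('final stats: %s', self.crawler.stats.get_stats())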
====== settings.py sample file ======
# -*- coding: utf-8 -*-

# Scrapy settings for ADemo project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     http://doc.scrapy.org/en/latest/topics/settings.html
#     http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#     http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'ADemo'

SPIDER_MODULES = ['ADemo.spiders']
NEWSPIDER_MODULE = 'ADemo.spiders'


# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0;WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False  # usually set to False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False


# Override the default request headers:
DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0;WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
}

# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
SPIDER_MIDDLEWARES = {
    # 'ADemo.middlewares.AdemoSpiderMiddleware': 543,
    # 'ADemo.middlewares.SpiderMiddleware': 543,
}

# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
DOWNLOADER_MIDDLEWARES = {
    'ADemo.middlewares.ProxyMilldeware': 301,
}

# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
#    'ADemo.pipelines.AdemoPipeline': 300,
#}

# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

# Retry settings
RETRY_ENABLED = True
RETRY_TIMES = 2
# RETRY_HTTP_CODES = [320, 500]

# Redirect settings
# REDIRECT_ENABLED = False
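Any of the values in this sample file can also be overridden for a single spider through the custom_settings class attribute, which is handy when only one spider needs different behaviour; a sketch:
import scrapy

class PoliteSpider(scrapy.Spider):
    name = 'polite'
    start_urls = ['http://example.com/']

    # these values override the project-wide settings.py for this spider only
    custom_settings = {
        'DOWNLOAD_DELAY': 5,
        'RETRY_ENABLED': False,
    }

    def parse(self, response):
        pass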
// Distributed crawling: scrapy-redis configuration
# SCHEDULER = "scrapy_redis.scheduler.Scheduler"
# DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
# REDIS_URL = "redis://user:password@host:port"
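With those three settings enabled (and the scrapy_redis package installed), a spider usually reads its start URLs from a Redis list rather than start_urls; a minimal sketch, where the redis_key name is arbitrary:
from scrapy_redis.spiders import RedisSpider

class DistributedSpider(RedisSpider):
    name = 'distributed'
    redis_key = 'distributed:start_urls'  # push URLs into this Redis list to feed the spider

    def parse(self, response):
        yield {'url': response.url}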
Handling cookies [copy the Cookie string from the browser's request headers into cookies_string]
cookies_string = '__utma=160444991.1773391101.1513841173.1515472372.1515564585.4; __utmc=160444991; __utmz=160444991.1515564585.4.4.utmcsr=baidu|utmccn=(organic)|utmcmd=organic'
cookies = {}
for item in cookies_string.split(';'):
    key, value = item.split('=', 1)       # split on the first '=' only; cookie values may contain '='
    cookies[key.strip()] = value.strip()
print(cookies)
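The resulting dict can be passed straight to a request via the cookies argument; a one-line sketch:
import scrapy

# Scrapy sends this dict as the Cookie header of the request
request = scrapy.Request('http://example.com/', cookies=cookies)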
Handling headers [copy the raw request-headers string from the browser into headers_string]
headers_string = ''   # paste the raw request headers copied from the browser here
headers = {}
for item in headers_string.split('\n'):
    if not item.strip():
        continue                        # skip blank lines
    key, value = item.split(':', 1)     # split on the first ':' only; values such as URLs may contain ':'
    headers[key.strip()] = value.strip()
print(headers)
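As with cookies, the parsed dict can then be attached to outgoing requests; a one-line sketch using the headers built above:
import scrapy

# per-request headers take precedence over DEFAULT_REQUEST_HEADERS
request = scrapy.Request('http://example.com/', headers=headers)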