Crawler performance
Performance
When writing a crawler, most of the cost is spent on IO. With a single process and a single thread, every URL request inevitably has to wait for its response, which slows the whole run down.
1. Synchronous execution:
import requests

def fetch_async(url):
    response = requests.get(url)
    return response


url_list = ['http://www.github.com', 'http://www.bing.com']

for url in url_list:
    fetch_async(url)
2. Multithreaded execution (thread pool)
from concurrent.futures import ThreadPoolExecutor
import requests


def fetch_async(url):
    response = requests.get(url)
    return response


url_list = ['http://www.github.com', 'http://www.bing.com']
pool = ThreadPoolExecutor(5)
for url in url_list:
    pool.submit(fetch_async, url)
pool.shutdown(wait=True)
3. Multithreading + callback
from concurrent.futures import ThreadPoolExecutor
import requests


def fetch_async(url):
    response = requests.get(url)
    return response


def callback(future):
    print(future.result())


url_list = ['http://www.github.com', 'http://www.bing.com']
pool = ThreadPoolExecutor(5)
for url in url_list:
    v = pool.submit(fetch_async, url)
    v.add_done_callback(callback)
pool.shutdown(wait=True)
4. Multiprocess execution (process pool)
from concurrent.futures import ProcessPoolExecutor
import requests


def fetch_async(url):
    response = requests.get(url)
    return response


url_list = ['http://www.github.com', 'http://www.bing.com']
pool = ProcessPoolExecutor(5)
for url in url_list:
    pool.submit(fetch_async, url)
pool.shutdown(wait=True)
5. Multiprocessing + callback
from concurrent.futures import ProcessPoolExecutor
import requests


def fetch_async(url):
    response = requests.get(url)
    return response


def callback(future):
    print(future.result())


url_list = ['http://www.github.com', 'http://www.bing.com']
pool = ProcessPoolExecutor(5)
for url in url_list:
    v = pool.submit(fetch_async, url)
    v.add_done_callback(callback)
pool.shutdown(wait=True)
All of the code above improves request throughput. The drawback of multithreading and multiprocessing is that threads and processes sit idle while blocked on IO, which wastes them, so asynchronous IO becomes the preferred approach:
asyncio works at the TCP level and does not speak HTTP itself, but you can build an HTTP request by writing the request headers yourself. In the second example below, the string built from host and url is exactly such a hand-written header (it is also where things like Content-Type and cookies would go).
writer.write() sends the request headers and body to the server, stating what you are asking for; writer.drain() then waits until that data has actually been flushed out, and this wait is where the IO blocking happens.
reader.read() receives the response and is also an IO wait; whichever response comes back first gets processed first.
Because all of this is tedious to write by hand, someone has already packaged it up as aiohttp, a module built on top of asyncio that supports HTTP requests directly.
import asyncio

@asyncio.coroutine
def func1():
    print('before...func1......')
    yield from asyncio.sleep(5)
    print('end...func1......')


tasks = [func1(), func1()]

loop = asyncio.get_event_loop()
loop.run_until_complete(asyncio.gather(*tasks))
loop.close()
import asyncio


@asyncio.coroutine
def fetch_async(host, url='/'):
    print(host, url)
    reader, writer = yield from asyncio.open_connection(host, 80)

    request_header_content = """GET %s HTTP/1.0\r\nHost: %s\r\n\r\n""" % (url, host,)
    request_header_content = bytes(request_header_content, encoding='utf-8')

    writer.write(request_header_content)
    yield from writer.drain()
    text = yield from reader.read()
    print(host, url, text)
    writer.close()

tasks = [
    fetch_async('www.cnblogs.com', '/wupeiqi/'),
    fetch_async('dig.chouti.com', '/pic/show?nid=4073644713430508&lid=10273091')
]

loop = asyncio.get_event_loop()
results = loop.run_until_complete(asyncio.gather(*tasks))
loop.close()
import aiohttp
import asyncio


@asyncio.coroutine
def fetch_async(url):
    print(url)
    response = yield from aiohttp.request('GET', url)
    # data = yield from response.read()
    # print(url, data)
    print(url, response)
    response.close()


tasks = [fetch_async('http://www.google.com/'), fetch_async('http://www.chouti.com/')]

event_loop = asyncio.get_event_loop()
results = event_loop.run_until_complete(asyncio.gather(*tasks))
event_loop.close()
aiohttp was written specifically for asyncio.
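The example above uses the old @asyncio.coroutine / yield from style, which later aiohttp and Python releases dropped. A minimal sketch of the same request on the current async/await API might look like this (the URLs are placeholders and it assumes aiohttp 3.x; this is not code from the original article):

import asyncio
import aiohttp


async def fetch_async(url):
    # one short-lived session per task keeps the sketch simple;
    # a shared session would be more efficient in real code
    async with aiohttp.ClientSession() as session:
        async with session.get(url) as response:      # non-blocking HTTP GET
            data = await response.read()              # response body as bytes
            print(url, response.status, len(data))


async def main():
    urls = ['http://www.cnblogs.com/', 'http://www.bing.com/']
    await asyncio.gather(*(fetch_async(u) for u in urls))


asyncio.run(main())

asyncio can also drive a purely blocking library such as requests: hand each blocking call to a thread pool with loop.run_in_executor and wait on the returned future, which is exactly what the next example does.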
import asyncio
import requests


@asyncio.coroutine
def fetch_async(func, *args):
    loop = asyncio.get_event_loop()  # the event loop
    future = loop.run_in_executor(None, func, *args)
    response = yield from future
    print(response.url, response.content)


tasks = [
    fetch_async(requests.get, 'http://www.cnblogs.com/wupeiqi/'),
    # requests.get is passed in as the function, the URL as its argument
    fetch_async(requests.get, 'http://dig.chouti.com/pic/show?nid=4073644713430508&lid=10273091')
]

loop = asyncio.get_event_loop()
results = loop.run_until_complete(asyncio.gather(*tasks))
loop.close()
from gevent import monkey

monkey.patch_all()  # replace every socket in the standard library with gevent's own,
                    # which cooperates with its event loop for non-blocking IO;
                    # patching before other imports is the recommended order

import gevent
import requests


def fetch_async(method, url, req_kwargs):
    print(method, url, req_kwargs)
    response = requests.request(method=method, url=url, **req_kwargs)
    print(response.url, response.content)

# ##### send the requests #####
gevent.joinall([
    gevent.spawn(fetch_async, method='get', url='https://www.python.org/', req_kwargs={}),
    gevent.spawn(fetch_async, method='get', url='https://www.yahoo.com/', req_kwargs={}),
    gevent.spawn(fetch_async, method='get', url='https://github.com/', req_kwargs={}),
])

# ##### send the requests (use a pool to cap the number of greenlets) #####
# from gevent.pool import Pool
# pool = Pool(None)  # pass an integer instead of None to cap the pool size
# gevent.joinall([
#     pool.spawn(fetch_async, method='get', url='https://www.python.org/', req_kwargs={}),
#     pool.spawn(fetch_async, method='get', url='https://www.yahoo.com/', req_kwargs={}),
#     pool.spawn(fetch_async, method='get', url='https://www.github.com/', req_kwargs={}),
# ])
import grequests
# a module that glues requests and gevent together

request_list = [
    grequests.get('http://httpbin.org/delay/1', timeout=0.001),
    grequests.get('http://fakedomain/'),
    grequests.get('http://httpbin.org/status/500')
]


# ##### run the requests and collect the list of responses #####
# response_list = grequests.map(request_list)
# print(response_list)


# ##### run the requests and collect the responses, handling exceptions #####
# def exception_handler(request, exception):
#     print(request, exception)
#     print("Request failed")

# response_list = grequests.map(request_list, exception_handler=exception_handler)
# print(response_list)
from twisted.web.client import getPage   # helper dedicated to sending HTTP requests
from twisted.internet import reactor     # the event loop

REV_COUNTER = 0
REQ_COUNTER = 0

def callback(contents):
    print(contents,)

    global REV_COUNTER
    REV_COUNTER += 1  # the callback runs once per completed response, so count it here
    if REV_COUNTER == REQ_COUNTER:
        reactor.stop()  # once the number of responses equals the number of requests,
                        # everything has come back and the reactor can be stopped


url_list = ['http://www.bing.com', 'http://www.baidu.com', ]
REQ_COUNTER = len(url_list)
for url in url_list:
    deferred = getPage(bytes(url, encoding='utf8'))
    deferred.addCallback(callback)
reactor.run()  # loop here, waiting for the results
from twisted.web.client import getPage
from twisted.internet import reactor


class TwistedRequest(object):
    def __init__(self):
        self.__req_counter = 0
        self.__rev_counter = 0

    def __execute(self, content, url, callback):  # content is the response that came back
        if callback:
            callback(url, content)
        self.__rev_counter += 1
        if self.__rev_counter == self.__req_counter:
            reactor.stop()

    def fetch_url(self, url_callback_list):

        self.__req_counter = len(url_callback_list)  # remember how many requests were issued

        for item in url_callback_list:
            url = item['url']
            success_callback = item['success_callback']
            error_callback = item['error_callback']
            # send the request for this URL
            deferred = getPage(bytes(url, encoding='utf8'))
            deferred.addCallback(self.__execute, url, success_callback)
            deferred.addErrback(self.__execute, url, error_callback)

        reactor.run()
#!/usr/bin/env python
# -*- coding:utf-8 -*-
from tornado.httpclient import AsyncHTTPClient
from tornado.httpclient import HTTPRequest
from tornado import ioloop


def handle_response(response):
    if response.error:
        print("Error:", response.error)
    else:
        print(response.body)
    # same approach as with twisted:
    # ioloop.IOLoop.current().stop()  # needs a counter so the loop only stops
    #                                 # once every response has come back (see the sketch below)


def func():
    url_list = [
        'http://www.cnblogs.com',
        'http://www.bing.com/test2/',
    ]
    for url in url_list:
        print(url)
        http_client = AsyncHTTPClient()
        # HTTPRequest(url) wraps the request; handle_response is the callback
        http_client.fetch(HTTPRequest(url), handle_response)  # send the request; the callback runs when the result arrives


# ioloop.IOLoop.current() returns the event loop
ioloop.IOLoop.current().add_callback(func)  # schedules func; nothing has been waited on yet
ioloop.IOLoop.current().start()  # run the loop until results come back
Tornado can also issue these asynchronous requests from inside its own web server (the HTTP client and the server share the same IOLoop); the other frameworks shown here do not offer that.
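The counter mentioned in the comment above is not shown in the original. A minimal sketch of one way to do it follows; the COUNT name is mine, and it assumes a Tornado version old enough for fetch to still accept a callback argument, as in the example above:

from tornado import ioloop
from tornado.httpclient import AsyncHTTPClient, HTTPRequest

COUNT = 0  # number of outstanding requests (illustrative name, not from the original)


def handle_response(response):
    global COUNT
    COUNT -= 1
    if response.error:
        print("Error:", response.error)
    else:
        print(len(response.body))
    if COUNT == 0:
        ioloop.IOLoop.current().stop()  # every response is back, stop the loop


def func():
    global COUNT
    url_list = ['http://www.cnblogs.com', 'http://www.bing.com']
    COUNT = len(url_list)
    for url in url_list:
        AsyncHTTPClient().fetch(HTTPRequest(url), handle_response)


ioloop.IOLoop.current().add_callback(func)
ioloop.IOLoop.current().start()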
All of the above are asynchronous IO request modules, either built into Python or provided by third parties; they are easy to use and greatly improve efficiency. At bottom, an asynchronous IO request is nothing more than a non-blocking socket plus IO multiplexing:
import select
import socket
import time


class AsyncTimeoutException(TimeoutError):
    """
    Raised when a request times out.
    """

    def __init__(self, msg):
        self.msg = msg
        super(AsyncTimeoutException, self).__init__(msg)


class HttpContext(object):
    """Wraps the basic data of one request and its response."""

    def __init__(self, sock, host, port, method, url, data, callback, timeout=5):
        """
        sock: the client socket used for this request
        host: host name to request
        port: port to request
        method: HTTP method
        url: URL to request
        data: request body
        callback: function to call once the request completes
        timeout: request timeout in seconds
        """
        self.sock = sock
        self.callback = callback
        self.host = host
        self.port = port
        self.method = method
        self.url = url
        self.data = data

        self.timeout = timeout

        self.__start_time = time.time()
        self.__buffer = []

    def is_timeout(self):
        """Has this request already timed out?"""
        current_time = time.time()
        if (self.__start_time + self.timeout) < current_time:
            return True

    def fileno(self):
        """File descriptor of the request socket, so select can monitor it."""
        return self.sock.fileno()

    def write(self, data):
        """Append a chunk of the response to the buffer."""
        self.__buffer.append(data)

    def finish(self, exc=None):
        """The response is complete (or failed); run the request's callback."""
        if not exc:
            response = b''.join(self.__buffer)
            self.callback(self, response, exc)
        else:
            self.callback(self, None, exc)

    def send_request_data(self):
        content = """%s %s HTTP/1.0\r\nHost: %s\r\n\r\n%s""" % (
            self.method.upper(), self.url, self.host, self.data,)

        return content.encode(encoding='utf8')


class AsyncRequest(object):
    def __init__(self):
        self.fds = []
        self.connections = []

    def add_request(self, host, port, method, url, data, callback, timeout):
        """Create one request."""
        client = socket.socket()
        client.setblocking(False)
        try:
            client.connect((host, port))
        except BlockingIOError as e:
            pass
            # print('the connection request has been sent to the remote host')
        req = HttpContext(client, host, port, method, url, data, callback, timeout)
        self.connections.append(req)
        self.fds.append(req)

    def check_conn_timeout(self):
        """Check every request and terminate any that has already timed out."""
        timeout_list = []
        for context in self.connections:
            if context.is_timeout():
                timeout_list.append(context)
        for context in timeout_list:
            context.finish(AsyncTimeoutException('request timed out'))
            self.fds.remove(context)
            self.connections.remove(context)

    def running(self):
        """Event loop: watch each request socket and react once it is ready."""
        while True:
            # r holds sockets that have response data to read, w holds sockets whose connection is established
            r, w, e = select.select(self.fds, self.connections, self.fds, 0.05)
            # select.select works off the file descriptor returned by socket.fileno()

            if not self.fds:
                return

            for context in r:
                sock = context.sock
                while True:
                    try:
                        data = sock.recv(8096)
                        if not data:
                            self.fds.remove(context)
                            context.finish()
                            break
                        else:
                            context.write(data)
                    except BlockingIOError as e:
                        break
                    except TimeoutError as e:
                        self.fds.remove(context)
                        self.connections.remove(context)
                        # context.sock.close()
                        # the socket should really be closed here so the server knows to drop the connection
                        context.finish(e)
                        break

            for context in w:
                # connected to the remote server: start sending the request data
                if context in self.fds:
                    data = context.send_request_data()
                    context.sock.sendall(data)
                    self.connections.remove(context)

            self.check_conn_timeout()


if __name__ == '__main__':
    def callback_func(context, response, ex):
        """
        :param context: HttpContext object that wraps the request details
        :param response: response body
        :param ex: exception object if something went wrong, otherwise None
        :return:
        """
        print(context, response, ex)

    obj = AsyncRequest()
    url_list = [
        {'host': 'www.google.com', 'port': 80, 'method': 'GET', 'url': '/', 'data': '', 'timeout': 5,
         'callback': callback_func},
        {'host': 'www.baidu.com', 'port': 80, 'method': 'GET', 'url': '/', 'data': '', 'timeout': 5,
         'callback': callback_func},
        {'host': 'www.bing.com', 'port': 80, 'method': 'GET', 'url': '/', 'data': '', 'timeout': 5,
         'callback': callback_func},
    ]
    for item in url_list:
        print(item)
        obj.add_request(**item)

    obj.running()
Scrapy
Scrapy is an application framework written to crawl web sites and extract structured data. It can be used in data mining, information processing, storing historical data, and similar programs.
It was originally designed for page scraping (more precisely, web scraping), but it can also be used to fetch data returned by APIs (such as Amazon Associates Web Services) or to build a general-purpose web crawler. Scrapy is widely used for data mining, monitoring, and automated testing.
Scrapy uses the Twisted asynchronous networking library to handle network communication. The overall architecture is roughly as follows.
Scrapy's main components:
- Engine (Scrapy): handles the data flow of the whole system and triggers events; it is the core of the framework.
- Scheduler: accepts requests from the engine, pushes them onto a queue, and hands them back when the engine asks again. Think of it as a priority queue of URLs (the addresses to crawl); it decides which URL to fetch next and removes duplicate URLs.
- Downloader: downloads page content and hands it back to the spiders (the downloader is built on the efficient asynchronous Twisted model).
- Spiders: where the real work happens; they extract the information you need, the so-called items, from specific pages. You can also extract links from a page so that Scrapy keeps crawling the next one.
- Item Pipeline: processes the items the spiders extract; its main jobs are persisting items, validating them, and dropping unwanted data. Once a page has been parsed by a spider, its items are sent to the pipeline and pass through a fixed sequence of processing steps.
- Downloader Middlewares: sit between the Scrapy engine and the downloader and process the requests and responses that pass between them.
- Spider Middlewares: sit between the Scrapy engine and the spiders and process the spiders' response input and request output.
- Scheduler Middlewares: sit between the Scrapy engine and the scheduler and process the requests and responses sent between them.
The Scrapy workflow is roughly:
- The engine takes a URL from the scheduler for the next crawl.
- The engine wraps the URL in a Request and passes it to the downloader.
- The downloader fetches the resource and wraps it in a Response.
- The spider parses the Response.
- Items parsed out of the response are handed to the item pipeline for further processing.
- URLs parsed out of the response are handed back to the scheduler to wait for crawling.
I. Installation
Linux
    pip3 install scrapy


Windows
    a. pip3 install wheel
    b. download Twisted from http://www.lfd.uci.edu/~gohlke/pythonlibs/#twisted
    c. cd into the download directory and run: pip3 install Twisted-17.1.0-cp35-cp35m-win_amd64.whl
    d. pip3 install scrapy
    e. download and install pywin32: https://sourceforge.net/projects/pywin32/files/
II. Basic usage
1. Basic commands
1. scrapy startproject <project_name>
   - create a new project in the current directory (similar to Django)

2. scrapy genspider [-t template] <name> <domain>
   - create a spider inside the project
   e.g.:
       scrapy genspider -t basic oldboy oldboy.com
       scrapy genspider -t xmlfeed autohome autohome.com.cn
   PS:
       list the available templates:  scrapy genspider -l
       show a template:               scrapy genspider -d <template_name>

3. scrapy list
   - list the spiders in the project

4. scrapy crawl <spider_name>
   - run a single spider
2. Project layout and a first look at a spider
project_name/
    scrapy.cfg            # top-level configuration file
    project_name/
        __init__.py
        items.py          # items.py and pipelines.py handle formatting and persistence
        pipelines.py
        settings.py
        spiders/
            __init__.py
            spider1.py
            spider2.py
            spider3.py
What each file is for:
- scrapy.cfg: the project's top-level configuration (the configuration that actually drives the crawl lives in settings.py)
- items.py: data templates used to structure the scraped data, comparable to Django models
- pipelines.py: data-processing behaviour, typically persisting the structured data
- settings.py: configuration such as recursion depth, concurrency, download delay, and so on
- spiders: the spider directory; create files here and write the crawling rules in them
Note: spider files are usually named after the site's domain.
After running scrapy genspider -t basic xiaohuar xiaohuar.com, a file named xiaohuar.py is generated in the project's spiders folder. Its contents look like the following; this is the spider:
import scrapy

class XiaoHuarSpider(scrapy.spiders.Spider):
    name = "xiaohuar"                    # spider name ***** required
    allowed_domains = ["xiaohuar.com"]   # allowed domains (only pages under this domain are crawled)
    start_urls = [
        "http://www.xiaohuar.com/hua/",  # start URL (the generated default is "http://www.xiaohuar.com/")
    ]

    def parse(self, response):
        # callback run after the start URL has been fetched; Scrapy calls it
        # automatically once the response is available, and what it does is up to you
        pass
3. A quick first try
import scrapy
from scrapy.selector import HtmlXPathSelector
from scrapy.http.request import Request


class DigSpider(scrapy.Spider):
    # spider name; the crawl command is started with this name
    name = "dig"

    # allowed domains
    allowed_domains = ["chouti.com"]

    # start URLs
    start_urls = [
        'http://dig.chouti.com/',
    ]

    has_request_set = {}

    def parse(self, response):
        print(response.url)

        hxs = HtmlXPathSelector(response)
        page_list = hxs.select('//div[@id="dig_lcpage"]//a[re:test(@href, "/all/hot/recent/\d+")]/@href').extract()
        for page in page_list:
            page_url = 'http://dig.chouti.com%s' % page
            key = self.md5(page_url)
            if key in self.has_request_set:
                pass
            else:
                self.has_request_set[key] = page_url
                obj = Request(url=page_url, method='GET', callback=self.parse)
                yield obj

    @staticmethod
    def md5(val):
        import hashlib
        ha = hashlib.md5()
        ha.update(bytes(val, encoding='utf-8'))
        key = ha.hexdigest()
        return key
Some sites forbid crawlers via robots.txt, but changing a single setting in the settings file gets around that:
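The original does not name the setting, but the one usually meant here is ROBOTSTXT_OBEY, which newer Scrapy project templates generate as True:

# settings.py
ROBOTSTXT_OBEY = False  # stop Scrapy from honouring the target site's robots.txt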
To run this spider, open a terminal in the project directory and execute:
scrapy crawl dig --nolog
The important parts of the code above:
- Request is the class that wraps a user request; yielding a Request object inside a callback tells Scrapy to keep crawling that URL.
- HtmlXPathSelector structures the HTML and provides selectors over it.
4. Selectors
#!/usr/bin/env python
# -*- coding:utf-8 -*-
from scrapy.selector import Selector, HtmlXPathSelector
from scrapy.http import HtmlResponse

html = """<!DOCTYPE html>
<html>
<head lang="en">
    <meta charset="UTF-8">
    <title></title>
</head>
<body>
    <ul>
        <li class="item-"><a id='i1' href="link.html">first item</a></li>
        <li class="item-0"><a id='i2' href="llink.html">first item</a></li>
        <li class="item-1"><a href="llink2.html">second item<span>vv</span></a></li>
    </ul>
    <div><a href="llink2.html">second item</a></div>
</body>
</html>
"""
response = HtmlResponse(url='http://example.com', body=html, encoding='utf-8')
# hxs = HtmlXPathSelector(response)
# print(hxs)
# hxs = Selector(response=response).xpath('//a')
# print(hxs)
# hxs = Selector(response=response).xpath('//a[2]')
# print(hxs)
# hxs = Selector(response=response).xpath('//a[@id]')
# print(hxs)
# hxs = Selector(response=response).xpath('//a[@id="i1"]')
# print(hxs)
# hxs = Selector(response=response).xpath('//a[@href="link.html"][@id="i1"]')
# print(hxs)
# hxs = Selector(response=response).xpath('//a[contains(@href, "link")]')
# print(hxs)
# hxs = Selector(response=response).xpath('//a[starts-with(@href, "link")]')
# print(hxs)
# hxs = Selector(response=response).xpath('//a[re:test(@id, "i\d+")]')
# print(hxs)
# hxs = Selector(response=response).xpath('//a[re:test(@id, "i\d+")]/text()').extract()
# print(hxs)
# hxs = Selector(response=response).xpath('//a[re:test(@id, "i\d+")]/@href').extract()
# print(hxs)
# hxs = Selector(response=response).xpath('/html/body/ul/li/a/@href').extract()
# print(hxs)
# hxs = Selector(response=response).xpath('//body/ul/li/a/@href').extract_first()
# print(hxs)

# ul_list = Selector(response=response).xpath('//body/ul/li')
# for item in ul_list:
#     v = item.xpath('./a/span')
#     # or
#     # v = item.xpath('a/span')
#     # or
#     # v = item.xpath('*/a/span')
#     print(v)
# -*- coding: utf-8 -*-
import scrapy
from scrapy.selector import HtmlXPathSelector
from scrapy.http.request import Request
from scrapy.http.cookies import CookieJar
from scrapy import FormRequest


class ChouTiSpider(scrapy.Spider):
    # spider name; the crawl command is started with this name
    name = "chouti"
    # allowed domains
    allowed_domains = ["chouti.com"]

    cookie_dict = {}
    has_request_set = {}

    def start_requests(self):
        url = 'http://dig.chouti.com/'
        # return [Request(url=url, callback=self.login)]
        yield Request(url=url, callback=self.login)

    def login(self, response):
        cookie_jar = CookieJar()
        cookie_jar.extract_cookies(response, response.request)
        for k, v in cookie_jar._cookies.items():
            for i, j in v.items():
                for m, n in j.items():
                    self.cookie_dict[m] = n.value

        req = Request(
            url='http://dig.chouti.com/login',
            method='POST',
            headers={'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8'},
            body='phone=8615131255089&password=pppppppp&oneMonth=1',
            cookies=self.cookie_dict,
            callback=self.check_login
        )
        yield req

    def check_login(self, response):
        req = Request(
            url='http://dig.chouti.com/',
            method='GET',
            callback=self.show,
            cookies=self.cookie_dict,
            dont_filter=True
        )
        yield req

    def show(self, response):
        # print(response)
        hxs = HtmlXPathSelector(response)
        news_list = hxs.select('//div[@id="content-list"]/div[@class="item"]')
        for new in news_list:
            # temp = new.xpath('div/div[@class="part2"]/@share-linkid').extract()
            link_id = new.xpath('*/div[@class="part2"]/@share-linkid').extract_first()
            yield Request(
                url='http://dig.chouti.com/link/vote?linksId=%s' % (link_id,),
                method='POST',
                cookies=self.cookie_dict,
                callback=self.do_favor
            )

        page_list = hxs.select('//div[@id="dig_lcpage"]//a[re:test(@href, "/all/hot/recent/\d+")]/@href').extract()
        for page in page_list:

            page_url = 'http://dig.chouti.com%s' % page
            import hashlib
            hash = hashlib.md5()
            hash.update(bytes(page_url, encoding='utf-8'))
            key = hash.hexdigest()
            if key in self.has_request_set:
                pass
            else:
                self.has_request_set[key] = page_url
                yield Request(
                    url=page_url,
                    method='GET',
                    callback=self.show
                )

    def do_favor(self, response):
        print(response.text)
Note: set DEPTH_LIMIT = 1 in settings.py to control how many levels of "recursion" (link following) are allowed, as shown below.
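For reference, that is a single line in the project's settings.py (the default of 0 means unlimited depth):

# settings.py
DEPTH_LIMIT = 1  # follow links at most one level deep from the start URLs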
5. Formatting the data
The examples above do only simple processing, so everything happens directly in the parse method. If you want richer processing of the scraped data, you can use Scrapy's items to structure it and then hand it over to the pipelines for uniform handling.
import scrapy
from scrapy.selector import HtmlXPathSelector
from scrapy.http.request import Request
from scrapy.http.cookies import CookieJar
from scrapy import FormRequest


class XiaoHuarSpider(scrapy.Spider):
    # spider name; the crawl command is started with this name
    name = "xiaohuar"
    # allowed domains
    allowed_domains = ["xiaohuar.com"]

    start_urls = [
        "http://www.xiaohuar.com/list-1-1.html",
    ]
    # custom_settings = {
    #     'ITEM_PIPELINES': {
    #         'spider1.pipelines.JsonPipeline': 100
    #     }
    # }
    has_request_set = {}

    def parse(self, response):
        # analyse the page
        # save the content that matches the rules (the pictures)
        # find all the <a> tags, then follow them, level by level

        hxs = HtmlXPathSelector(response)

        items = hxs.select('//div[@class="item_list infinite_scroll"]/div')
        for item in items:
            src = item.select('.//div[@class="img"]/a/img/@src').extract_first()
            name = item.select('.//div[@class="img"]/span/text()').extract_first()
            school = item.select('.//div[@class="img"]/div[@class="btns"]/a/text()').extract_first()
            url = "http://www.xiaohuar.com%s" % src
            from ..items import XiaoHuarItem
            obj = XiaoHuarItem(name=name, school=school, url=url)
            yield obj

        urls = hxs.select('//a[re:test(@href, "http://www.xiaohuar.com/list-1-\d+.html")]/@href')
        for url in urls:
            key = self.md5(url)
            if key in self.has_request_set:
                pass
            else:
                self.has_request_set[key] = url
                req = Request(url=url, method='GET', callback=self.parse)
                yield req

    @staticmethod
    def md5(val):
        import hashlib
        ha = hashlib.md5()
        ha.update(bytes(val, encoding='utf-8'))
        key = ha.hexdigest()
        return key
import scrapy


class XiaoHuarItem(scrapy.Item):
    name = scrapy.Field()
    school = scrapy.Field()
    url = scrapy.Field()
import json
import os
import requests


class JsonPipeline(object):
    def __init__(self):
        self.file = open('xiaohua.txt', 'w')

    def process_item(self, item, spider):
        v = json.dumps(dict(item), ensure_ascii=False)
        self.file.write(v)
        self.file.write('\n')
        self.file.flush()
        return item


class FilePipeline(object):
    def __init__(self):
        if not os.path.exists('imgs'):
            os.makedirs('imgs')

    def process_item(self, item, spider):
        response = requests.get(item['url'], stream=True)
        file_name = '%s_%s.jpg' % (item['name'], item['school'])
        with open(os.path.join('imgs', file_name), mode='wb') as f:
            f.write(response.content)
        return item
ITEM_PIPELINES = {
    'spider1.pipelines.JsonPipeline': 100,
    'spider1.pipelines.FilePipeline': 300,
}
# The integer after each entry determines the order in which items pass through the
# pipelines, lowest first; by convention these numbers are kept in the 0-1000 range.
6. Middleware
# -*- coding: utf-8 -*-

# Define here the models for your spider middleware
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/spider-middleware.html


class CustomSpiderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.

        # Should return None or raise an exception.
        print('process_spider_input', len(response.text))
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.
        print('process_spider_output', len(response.text))
        # Must return an iterable of Request, dict or Item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.

        # Should return either None or an iterable of Response, dict
        # or Item objects.
        print('process_spider_exception')
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn't have a response associated.

        # Must return only requests (not items).
        print('process_start_requests')
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)


class CustomDownloaderMiddleware(object):
    def process_request(self, request, spider):
        return None

    def process_response(self, request, response, spider):
        return response

    def process_exception(self, request, exception, spider):
        return None
# settings.py

DOWNLOADER_MIDDLEWARES = {
    'spider1.middlewares.CustomDownloaderMiddleware': 543,
}
SPIDER_MIDDLEWARES = {
    'spider1.middlewares.CustomSpiderMiddleware': 543,
}
7. Custom commands
- Create a directory at the same level as spiders, e.g. commands.
- Inside it, create a crawlall.py file (the file name becomes the custom command).
from scrapy.commands import ScrapyCommand
from scrapy.utils.project import get_project_settings


class Command(ScrapyCommand):

    requires_project = True

    def syntax(self):
        return '[options]'

    def short_desc(self):
        return 'Runs all of the spiders'

    def run(self, args, opts):
        spider_list = self.crawler_process.spiders.list()
        for name in spider_list:
            self.crawler_process.crawl(name, **opts.__dict__)
        self.crawler_process.start()
- Add COMMANDS_MODULE = '<project_name>.<directory_name>' to settings.py (a concrete example follows right after this list).
- Run the command from the project directory: scrapy crawlall
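For example, reusing the spider1 project name from the earlier settings snippets and the commands directory created above, the setting would look like this:

# settings.py
COMMANDS_MODULE = 'spider1.commands'  # <project_name>.<directory_name>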
More documentation: http://scrapy-chs.readthedocs.io/zh_CN/latest/index.html