A First Foray into Performance

Topics: processes, threads, coroutines, IO multiplexing
Module: asyncio

Food for thought: when writing a crawler, most of the time is spent on IO. In single-process, single-thread mode, every URL request blocks while waiting for the response, which slows down the run as a whole.

import requests

def fetch_async(url):
    response = requests.get(url)  # blocks until the response arrives
    return response

url_list = [
    'http://www.baidu.com',
    'http://www.cnblogs.com',
    'http://www.sina.com',
    'http://www.bing.com',
    'http://www.qq.com',
    'https://www.taobao.com/',
]

for url in url_list:
    res = fetch_async(url)  # requests are issued one after another
    print(res)
Synchronous execution with requests
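To make the cost concrete, here is a minimal timing sketch (the two-URL subset and the use of time.perf_counter are illustrative, not from the original):

import time
import requests

urls = ['http://www.baidu.com', 'http://www.qq.com']  # illustrative subset

start = time.perf_counter()
for url in urls:
    requests.get(url)  # each call blocks until its response arrives
print('sequential total: %.2fs' % (time.perf_counter() - start))
# the total is roughly the SUM of the individual request latencies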
import requests
from concurrent.futures import ThreadPoolExecutor

def fetch_async(url):
    response = requests.get(url)
    return response

url_list = [
    'http://www.baidu.com',
    'http://www.cnblogs.com',
    'http://www.sina.com',
    'http://www.bing.com',
    'http://www.qq.com',
    'https://www.taobao.com/',
]

pool = ThreadPoolExecutor(5)  # at most 5 requests in flight at once
for url in url_list:
    res = pool.submit(fetch_async, url)
    print(res)  # res is a Future, not the Response itself
pool.shutdown(wait=True)  # wait for all submitted tasks to finish
Multithreading
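Since submit() returns Future objects, the print above shows futures rather than responses. A sketch for collecting the actual responses as they finish (as_completed is part of concurrent.futures; the URL subset is illustrative):

import requests
from concurrent.futures import ThreadPoolExecutor, as_completed

url_list = ['http://www.baidu.com', 'http://www.bing.com']  # illustrative subset

with ThreadPoolExecutor(5) as pool:
    futures = {pool.submit(requests.get, url): url for url in url_list}
    for future in as_completed(futures):
        # result() returns the Response, re-raising any exception from the worker
        print(futures[future], future.result().status_code)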
import requests
from concurrent.futures import ThreadPoolExecutor

def fetch_async(url):
    response = requests.get(url)
    return response

def callback(future):
    # runs when fetch_async finishes; future.result() is the Response
    print(future.result().text)

url_list = [
    'http://www.baidu.com',
    'http://www.cnblogs.com',
    'http://www.sina.com',
    'http://www.bing.com',
    'http://www.qq.com',
    'https://www.taobao.com/',
]

pool = ThreadPoolExecutor(5)
for url in url_list:
    res = pool.submit(fetch_async, url)
    res.add_done_callback(callback)
pool.shutdown(wait=True)
Multithreading + callback
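Note that a callback registered with add_done_callback runs in the worker thread that completed the future (or immediately in the submitting thread if the future is already done), so heavy work inside the callback can still become a bottleneck.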
import requests
from concurrent.futures import ProcessPoolExecutor

def fetch_async(url):
    res = requests.get(url)
    return res

url_list = [
    'http://www.baidu.com',
    'http://www.cnblogs.com',
    'http://www.sina.com',
    'http://www.bing.com',
    'http://www.qq.com',
    'https://www.taobao.com/',
]

if __name__ == '__main__':
    pool = ProcessPoolExecutor(5)
    for url in url_list:
        pool.submit(fetch_async, url)
    pool.shutdown(wait=True)
Multiprocessing
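The if __name__ == '__main__' guard matters here: ProcessPoolExecutor workers re-import the main module (always on Windows, and on any platform using the spawn start method), and without the guard each worker would try to spawn its own pool.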
import requests
from concurrent.futures import ProcessPoolExecutor

def fetch_async(url):
    res = requests.get(url)
    return res

def callback(future):
    print(future.result())

url_list = [
    'http://www.baidu.com',
    'http://www.cnblogs.com',
    'http://www.sina.com',
    'http://www.bing.com',
    'http://www.qq.com',
    'https://www.taobao.com/',
]

if __name__ == '__main__':
    pool = ProcessPoolExecutor(5)
    for url in url_list:
        res = pool.submit(fetch_async, url)
        res.add_done_callback(callback)
    pool.shutdown(wait=True)
Multiprocessing + callback

Any of the code above improves request throughput. The drawback of multithreading and multiprocessing is that blocked IO wastes the thread or process that is waiting on it, so asynchronous IO is the preferred choice:

import asyncio

@asyncio.coroutine
def fun1():
    print('before...func1......')
    yield from asyncio.sleep(5)  # yields control back to the event loop while waiting
    print('end...func1......')

tasks = [fun1(), fun1()]

loop = asyncio.get_event_loop()
loop.run_until_complete(asyncio.gather(*tasks))
loop.close()
asyncio 1
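@asyncio.coroutine and yield from are the pre-3.5 coroutine style; Python deprecated them in 3.8 and removed them in 3.11. A sketch of the modern equivalent using async/await (asyncio.run requires Python 3.7+):

import asyncio

async def fun1():
    print('before...func1......')
    await asyncio.sleep(5)  # non-blocking sleep
    print('end...func1......')

async def main():
    await asyncio.gather(fun1(), fun1())  # both coroutines run concurrently

asyncio.run(main())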
import asyncio

@asyncio.coroutine
def fun(host, url='/'):
    print(host, url)
    # open the connection
    reader, writer = yield from asyncio.open_connection(host, 80)
    # build the raw HTTP request
    request_header_content = "GET %s HTTP/1.0\r\nHost: %s\r\n\r\n" % (url, host,)
    # send the request; write() takes bytes, not str
    writer.write(request_header_content.encode('utf-8'))
    # flush the write buffer
    yield from writer.drain()
    # read the response
    text = yield from reader.read()
    print(host, url, text)
    writer.close()

tasks = [
    # fun('host', '/url')
    fun('www.cnblogs.com', '/index/'),
    fun('dig.chouti.com', '/pic/show?nid=4073644713430508&lid=10273091')
]

loop = asyncio.get_event_loop()

loop.run_until_complete(asyncio.gather(*tasks))
loop.close()
asyncio 2
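Because the request is HTTP/1.0 without keep-alive, the server closes the connection after responding, so reader.read() can simply read until EOF; with HTTP/1.1 you would need to honor Content-Length or chunked encoding instead.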
import asyncio
import aiohttp

@asyncio.coroutine
def fun(url):
    print(url)
    # the module-level aiohttp.request(...) coroutine matches early aiohttp releases
    response = yield from aiohttp.request('GET', url)
    response.close()

tasks = [
    fun('http://www.baidu.com'),
    fun('http://dig.chouti.com')
]

if __name__ == '__main__':
    loop = asyncio.get_event_loop()
    results = loop.run_until_complete(asyncio.gather(*tasks))
    loop.close()
asyncio + aiohttp
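Current aiohttp (3.x) is organized around ClientSession rather than the module-level request coroutine above. A sketch against the modern API, reusing the same two URLs:

import asyncio
import aiohttp

async def fun(session, url):
    # the session pools connections; the context manager releases them
    async with session.get(url) as response:
        print(url, response.status)

async def main():
    async with aiohttp.ClientSession() as session:
        await asyncio.gather(
            fun(session, 'http://www.baidu.com'),
            fun(session, 'http://dig.chouti.com'),
        )

asyncio.run(main())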
import asyncio
import requests

@asyncio.coroutine
def fun(func, *args):
    loop = asyncio.get_event_loop()
    # run the blocking call in the loop's default thread pool
    future = loop.run_in_executor(None, func, *args)
    response = yield from future
    print(response.url, response.content)


tasks = [
    fun(requests.get, 'http://www.baidu.com'),
    fun(requests.get, 'http://dig.chouti.com')
]

loop = asyncio.get_event_loop()
results = loop.run_until_complete(asyncio.gather(*tasks))
loop.close()
asyncio + requests
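Note that run_in_executor(None, ...) hands the blocking requests.get call to the event loop's default ThreadPoolExecutor, so this pattern is thread-based concurrency behind an asyncio facade rather than true non-blocking IO; its main use is mixing a blocking library into an otherwise async program.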
import grequests


request_list = [
    grequests.get('http://httpbin.org/delay/1', timeout=0.001),
    grequests.get('http://fakedomain/'),
    grequests.get('http://httpbin.org/status/500')
]


# ##### Execute and collect the response list #####
# response_list = grequests.map(request_list)
# print(response_list)


# ##### Execute and collect the response list (handling exceptions) #####
# def exception_handler(request, exception):
#     print(request, exception)
#     print("Request failed")
#
# response_list = grequests.map(request_list, exception_handler=exception_handler)
# print(response_list)

grequests
grequests is essentially gevent + requests
from gevent import monkey
monkey.patch_all()  # patch the standard library (sockets etc.) before anything else uses it

import gevent
import requests

def fun(method, url, req_kwargs):
    print(method, url, req_kwargs)
    res = requests.request(method=method, url=url, **req_kwargs)
    print(res.url, res.content)

# ##### Send requests #####
# gevent.joinall([
#     gevent.spawn(fun, method='get', url='https://www.python.org/', req_kwargs={}),
#     gevent.spawn(fun, method='get', url='https://www.yahoo.com/', req_kwargs={}),
#     gevent.spawn(fun, method='get', url='https://github.com/', req_kwargs={}),
# ])

# ##### Send requests (a pool caps the number of concurrent greenlets) #####
from gevent.pool import Pool

pool = Pool(4)
gevent.joinall([
    pool.spawn(fun, method='get', url='https://www.python.org/', req_kwargs={}),
    pool.spawn(fun, method='get', url='https://www.baidu.com', req_kwargs={}),
    pool.spawn(fun, method='get', url='http://www.cnblogs.com', req_kwargs={}),
])
gevent + requests
from twisted.internet import defer
from twisted.web.client import getPage
from twisted.internet import reactor

def one_done(arg):
    print(arg)


@defer.inlineCallbacks
def task(url):
    # send the HTTP request; getPage returns a Deferred immediately
    res = getPage(bytes(url, encoding='utf-8'))
    # add a callback to run once the page has been fetched
    res.addCallback(one_done)
    yield res

def all_done(arg):
    print('done')
    reactor.stop()

url_list = [
    'http://www.cnblogs.com',
    'http://www.cnblogs.com',
    'http://www.cnblogs.com',
    'http://www.cnblogs.com',
]

defer_list = []  # list of Deferreds (each request has already been sent)
for url in url_list:
    v = task(url)
    # thanks to @defer.inlineCallbacks the call returns at once,
    # so the next request is issued without blocking
    defer_list.append(v)

# DeferredList keeps an internal counter and only fires once all Deferreds finish
d = defer.DeferredList(defer_list)
d.addBoth(all_done)


reactor.run()  # event loop; runs until reactor.stop()
Twisted
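Note: newer Twisted releases deprecate getPage; current code would use twisted.web.client.Agent (or the third-party treq package) instead, but the Deferred/DeferredList flow shown here is unchanged.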
from tornado.httpclient import AsyncHTTPClient
from tornado.httpclient import HTTPRequest
from tornado import ioloop

COUNT = 0

def handle_response(response):
    global COUNT
    COUNT -= 1
    if response.error:
        print("Error:", response.error)
    else:
        print(response.body)
    # same idea as the Twisted example: stop the loop once every request is done
    if COUNT == 0:
        ioloop.IOLoop.current().stop()

def func():
    url_list = [
        'http://www.cnblogs.com',
        'http://www.baidu.com',
    ]
    global COUNT
    COUNT = len(url_list)
    for url in url_list:
        print(url)
        http_client = AsyncHTTPClient()
        http_client.fetch(HTTPRequest(url), handle_response)


ioloop.IOLoop.current().add_callback(func)
ioloop.IOLoop.current().start()  # event loop; runs until stop()
Tornado
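In newer Tornado (5+), fetch() returns a Future you can await, which removes the manual counter. A sketch (gen.multi and run_sync are part of Tornado's public API):

from tornado import gen, ioloop
from tornado.httpclient import AsyncHTTPClient

async def main():
    client = AsyncHTTPClient()
    urls = ['http://www.cnblogs.com', 'http://www.baidu.com']
    # gen.multi awaits all fetches concurrently
    responses = await gen.multi([client.fetch(url) for url in urls])
    for response in responses:
        print(len(response.body))

ioloop.IOLoop.current().run_sync(main)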

 

All of the modules above, built-in and third-party, provide asynchronous IO for requests; they are easy to use and improve efficiency considerably. Under the hood, asynchronous IO requests come down to [non-blocking sockets] + [IO multiplexing]:

import socket
import select


# ##### Blocking version #####
# sk = socket.socket()
# sk.connect(('www.baidu.com', 80,))
# sk.send(b'GET / HTTP/1.0\r\nHost:www.baidu.com\r\n\r\n')
# sk.send(b'POST / HTTP/1.0\r\nHost:www.baidu.com\r\n\r\nk1=v1&k2=v2')
#
# data = sk.recv(8096)  # blocks on IO
# print(data)
# sk.close()

# ##### Non-blocking version #####
# sk = socket.socket()
# sk.setblocking(False)
# # 1. connect
# try:
#     sk.connect(('www.baidu.com', 80,))  # would block, so it raises instead
#     print('connected...')
# except BlockingIOError as e:
#     print(e)
# # 2. once connected, send the request
# sk.send(b'GET / HTTP/1.0\r\nHost:www.baidu.com\r\n\r\n')
# # sk.send(b'POST / HTTP/1.0\r\nHost:www.baidu.com\r\n\r\nk1=v1&k2=v2')
#
# # 3. wait for the server's response
# data = sk.recv(8096)  # raises BlockingIOError if no data has arrived yet
# print(data)
#
# # close the connection
# sk.close()

class HttpRequest:
    def __init__(self, sk, host, callback):
        self.socket = sk
        self.host = host
        self.callback = callback

    def fileno(self):
        # select() requires a file descriptor; delegating fileno()
        # lets select treat this object like its underlying socket
        return self.socket.fileno()



class HttpResponse:
    def __init__(self, recv_data):
        self.recv_data = recv_data
        self.header_dict = {}
        self.body = None

        self.initialize()

    def initialize(self):
        # headers and body are separated by the first blank line
        headers, body = self.recv_data.split(b'\r\n\r\n', 1)
        self.body = body
        header_list = headers.split(b'\r\n')
        for h in header_list:
            h_str = str(h, encoding='utf-8')
            v = h_str.split(':', 1)
            if len(v) == 2:
                self.header_dict[v[0]] = v[1]


class AsyncRequest:
    def __init__(self):
        self.conn = []        # all in-flight requests, watched for readability
        self.connection = []  # requests still connecting, watched for writability

    def add_request(self, host, callback):
        sk = socket.socket()
        sk.setblocking(0)
        try:
            sk.connect((host, 80,))
        except BlockingIOError as e:
            # expected: a non-blocking connect is still in progress
            print(e)
        request = HttpRequest(sk, host, callback)
        self.conn.append(request)
        self.connection.append(request)

    def run(self):
        while True:
            # writable means the connect finished; readable means response data arrived
            rlist, wlist, elist = select.select(self.conn, self.connection, self.conn, 0.05)
            for w in wlist:
                print(w.host, 'connected...')
                # reaching here means the socket finished its handshake with the server
                tpl = "GET / HTTP/1.0\r\nHost:%s\r\n\r\n" % (w.host,)
                w.socket.send(bytes(tpl, encoding='utf-8'))
                self.connection.remove(w)
            for r in rlist:
                # r is an HttpRequest
                recv_data = bytes()
                while True:
                    try:
                        chunk = r.socket.recv(8096)
                        if not chunk:
                            break  # empty bytes: the server closed the connection
                        recv_data += chunk
                    except Exception as e:
                        # BlockingIOError: no more data available right now
                        break
                response = HttpResponse(recv_data)
                r.callback(response)
                r.socket.close()
                self.conn.remove(r)
            if len(self.conn) == 0:
                break

def f1(response):
    print('save to file', response.header_dict)

def f2(response):
    print('save to database', response.header_dict)

url_list = [
    {'host': 'www.baidu.com', 'callback': f1},
    {'host': 'cn.bing.com', 'callback': f2},
    {'host': 'www.cnblogs.com', 'callback': f2},
]

req = AsyncRequest()
for item in url_list:
    req.add_request(item['host'], item['callback'])

req.run()
Custom implementation

 
