爬虫常用的其他模块
gevent模块
"""
gevent 内部就是使用的生成器实现
"""
import gevent
from gevent import monkey;monkey.patch_all()
import requests
def parse_data(resp):
    """Callback invoked with a fetched page body; reports its size."""
    size = len(resp)
    print("parse data:", size)
urls = ["https://www.python.org","https://www.qq.com","https://www.taobao.com"]
def get_page(url, call_back):
    """Fetch *url* synchronously and hand the response text to *call_back*.

    The result is delivered through the callback instead of being returned,
    so this works as a gevent-spawned job.
    """
    print("get : ", url)
    response = requests.get(url)
    call_back(response.text)
# ts = []
# for i in urls:
# g = gevent.spawn(get_page,i,parse_data)
# ts.append(g)
#
# gevent.joinall(ts)
#
# print("over")
# 协程池
from gevent.pool import Pool
# Coroutine pool capped at 100 concurrent greenlets.
pool = Pool(100)
jobs = [
    pool.spawn(get_page, 'https://www.python.org/doc', parse_data),
    pool.spawn(get_page, 'https://www.cnblogs.com/linhaifeng', parse_data),
    pool.spawn(get_page, 'https://www.openstack.org', parse_data),
]
# Block until every spawned fetch has finished.
gevent.joinall(jobs)
# print(g1.value,g2.value,g3.value) #拿到返回值
asyncio实现简单的爬虫
import asyncio
"""
asyncio 是处理IO操作的 所以 我们需要自己处理读和写(http协议)
在建立连接时 如果需要加密链接 则需要依赖pyopenssl模块
"""
user_agent='Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.221 Safari/537.36 SE 2.X MetaSr 1.0'
async def request(method, host, port, ssl=False):
    """Issue a minimal HTTP/1.0 request to *host* and consume the response.

    asyncio only handles the I/O, so the HTTP request line and headers are
    written by hand. Fix: ``@asyncio.coroutine`` / ``yield from`` were
    removed in Python 3.11, so this is now a native ``async def`` coroutine
    (same name, parameters, and behavior).

    :param method: HTTP verb, e.g. "get" (upper-cased before sending)
    :param host: server host name
    :param port: TCP port; ignored and forced to 443 when ssl is true
    :param ssl: use a TLS connection (requires SSL support installed)
    """
    print("get", host)
    if ssl:
        # HTTPS always travels over 443 regardless of the port argument.
        port = 443
    reader, writer = await asyncio.open_connection(host=host, port=port, ssl=ssl)
    # Build and buffer the request headers; write() only queues the bytes.
    request_headers = """%s / HTTP/1.0\r\nHost: %s\r\nUser-agent: %s\r\n\r\n""" % (method.upper(), host, user_agent)
    print(request_headers)
    writer.write(request_headers.encode("utf-8"))
    # drain() actually flushes the buffered bytes onto the socket.
    await writer.drain()
    # Read response headers line by line until the blank separator line.
    while True:
        line = await reader.readline()
        if line == b"\r\n":
            break
        # print(host,"headers: ",line)
    # Everything after the blank line is the response body.
    body = await reader.read()
    # print(body)
    writer.close()  # closing our side lets the peer close too
loop = asyncio.get_event_loop()
urls = ["www.python.org", "www.qq.com", "www.taobao.com"]
# ssl=True makes request() force port 443, so the 80 passed here is ignored.
ts = [request("get", i, 80, True) for i in urls]
# Fix: asyncio.wait() no longer accepts bare coroutines (rejected since
# Python 3.12); gather() accepts them on all versions. The aggregate result
# is unused here, so the change is behavior-compatible.
loop.run_until_complete(asyncio.gather(*ts))
loop.close()
asyncio+requests
import requests,asyncio
"""
"""
async def task(url):
    """Fetch *url* with blocking requests.get, off-loaded to a thread pool.

    requests is synchronous, so it is pushed into the loop's default
    executor to keep the event loop responsive. Fix: ``@asyncio.coroutine``
    / ``yield from`` were removed in Python 3.11, so this is now a native
    ``async def`` coroutine (same name, parameter, and behavior).
    """
    print(url)
    # Inside a running coroutine this returns the running loop.
    loop = asyncio.get_event_loop()
    res = await loop.run_in_executor(None, requests.get, url)
    print(res.text)
loop = asyncio.get_event_loop()
urls = ["https://www.python.org", "https://www.qq.com", "https://www.taobao.com"]
ts = [task(i) for i in urls]
# Fix: asyncio.wait() no longer accepts bare coroutines (rejected since
# Python 3.12); gather() accepts them on all versions. The aggregate result
# is unused, so the change is behavior-compatible.
loop.run_until_complete(asyncio.gather(*ts))
loop.close()
grequests模块
import grequests
urls = [
    "https://www.python.org",
    "https://www.qq.com",
    "https://www.taobao.com",
]
# Build the unsent request objects, then let map() dispatch the whole batch
# concurrently and collect the responses.
ts = [grequests.request("get", target) for target in urls]
res = grequests.map(ts)
print(res)
tornado模块
from tornado.httpclient import AsyncHTTPClient
from tornado.httpclient import HTTPRequest
from tornado import ioloop
count=0
def handle_response(response):
    """Per-request callback: report the result and stop the IO loop once
    every outstanding fetch has completed (via the shared counter, by
    calling ioloop.IOLoop.current().stop()).

    :param response: tornado HTTPResponse delivered by the client
    :return: None
    """
    error = response.error
    if error:
        print("Error:", error)
    else:
        print(len(response.body))
    global count
    count -= 1  # one more fetch has finished
    if count == 0:
        # Nothing left in flight; break out of ioloop.start().
        ioloop.IOLoop.current().stop()
def func():
    """Queue the asynchronous fetches and bump the in-flight counter."""
    targets = [
        'http://www.baidu.com',
        'http://www.bing.com',
    ]
    global count
    for target in targets:
        print(target)
        # AsyncHTTPClient() returns the per-loop client instance.
        client = AsyncHTTPClient()
        client.fetch(HTTPRequest(target), handle_response)
        count += 1  # one more request in flight
current_loop = ioloop.IOLoop.current()
# Schedule the fetches, then block in start() until handle_response()
# stops the loop.
current_loop.add_callback(func)
current_loop.start()
twisted模块
from twisted.web.client import getPage,defer
from twisted.internet import reactor
def all_done(arg):
    """Fired once every deferred has resolved; shuts the reactor down."""
    reactor.stop()
def callback(res):
    """Print the downloaded page body and pass a dummy value down the chain."""
    print(res)
    return 1
defer_list = []
urls = [
    'http://www.baidu.com',
    'http://www.bing.com',
    'https://www.python.org',
]
for url in urls:
    print(url)
    # Fix: the codec name was misspelled as 'utf=-8', which makes
    # str.encode() raise LookupError before any request is sent.
    obj = getPage(url.encode('utf-8'))
    obj.addCallback(callback)
    defer_list.append(obj)
# defer_list holds the pending downloads; addBoth fires all_done when every
# deferred has either succeeded or failed, stopping the reactor.
defer.DeferredList(defer_list).addBoth(all_done)
reactor.run()