Other modules commonly used by crawlers

The gevent module


"""
gevent 内部就是使用的生成器实现

"""
from gevent import monkey; monkey.patch_all()  # patch blocking stdlib I/O before importing requests
import gevent
import requests

def parse_data(resp):
    print("parse data:", len(resp))

urls = ["https://www.python.org","https://www.qq.com","https://www.taobao.com"]

def get_page(url, call_back):
    print("get : ", url)
    res = requests.get(url)
    # return res.text
    call_back(res.text)


# ts = []
# for i in urls:
#     g = gevent.spawn(get_page, i, parse_data)
#     ts.append(g)
#
# gevent.joinall(ts)
#
# print("over")

# Coroutine pool: Pool(100) caps the number of greenlets running at once
from gevent.pool import Pool
pool = Pool(100)
g1 = pool.spawn(get_page, 'https://www.python.org/doc', parse_data)
g2 = pool.spawn(get_page, 'https://www.cnblogs.com/linhaifeng', parse_data)
g3 = pool.spawn(get_page, 'https://www.openstack.org', parse_data)
gevent.joinall([g1, g2, g3])
# print(g1.value, g2.value, g3.value)  # the greenlets' return values (None here, since get_page returns nothing)
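A greenlet's .value only carries something useful if the spawned function returns it, and get_page above hands its result to a callback instead. A minimal sketch of collecting results through the pool directly (the fetch helper is mine, not from the original):

import gevent
from gevent import monkey; monkey.patch_all()
from gevent.pool import Pool
import requests

def fetch(url):
    # Return the body so pool.map can hand the results back
    return requests.get(url).text

pool = Pool(100)
# pool.map blocks until every greenlet finishes and keeps the input order
pages = pool.map(fetch, ["https://www.python.org", "https://www.qq.com"])
print([len(p) for p in pages])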

A simple crawler with asyncio
import asyncio
"""
asyncio 是处理IO操作的 所以 我们需要自己处理读和写(http协议)
在建立连接时 如果需要加密链接 则需要依赖pyopenssl模块
"""
user_agent='Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.221 Safari/537.36 SE 2.X MetaSr 1.0'

@asyncio.coroutine  # legacy generator-based style; Python 3.5+ would use async def / await
def request(method, host, port, ssl=False):
    print("get", host)
    if ssl:
        port = 443
    reader, writer = yield from asyncio.open_connection(host=host, port=port, ssl=ssl)

    # Build and send the request headers
    request_headers = "%s / HTTP/1.0\r\nHost: %s\r\nUser-agent: %s\r\n\r\n" % (method.upper(), host, user_agent)
    print(request_headers)
    # write() only hands the bytes to the transport's buffer ...
    writer.write(request_headers.encode("utf-8"))
    # ... drain() actually pushes them onto the wire
    yield from writer.drain()

    # Read the response headers, up to the blank line that ends them
    while True:
        line = yield from reader.readline()
        if line == b"\r\n":
            break
        # print(host, "headers: ", line)
    # Then the response body
    body = yield from reader.read()
    # print(body)
    writer.close()  # closing the writer prompts the peer to close too

loop = asyncio.get_event_loop()
urls = ["www.python.org", "www.qq.com", "www.taobao.com"]
ts = [request("get", i, 80, True) for i in urls]  # ssl=True, so request() switches to port 443
loop.run_until_complete(asyncio.wait(ts))
loop.close()
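The generator-based @asyncio.coroutine / yield from style shown above was removed in Python 3.11; on 3.5+ the same raw-HTTP crawler reads more naturally with async def / await. A minimal sketch under that syntax (https only, so the port juggling disappears):

import asyncio

async def request(host):
    # Same idea: speak HTTP/1.0 over a TLS connection opened by asyncio
    reader, writer = await asyncio.open_connection(host=host, port=443, ssl=True)
    writer.write(("GET / HTTP/1.0\r\nHost: %s\r\n\r\n" % host).encode("utf-8"))
    await writer.drain()
    body = await reader.read()  # headers + body, until the server closes
    print(host, len(body))
    writer.close()

async def main():
    await asyncio.gather(*(request(h) for h in ["www.python.org", "www.qq.com"]))

asyncio.run(main())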


asyncio+requests
import requests, asyncio
"""
requests is blocking and cannot be awaited directly, so each call is pushed
into the event loop's default thread-pool executor; run_in_executor returns
an awaitable future that resolves to the Response.
"""
@asyncio.coroutine
def task(url):
    print(url)
    # Grab the loop so we can reach its executor
    loop = asyncio.get_event_loop()
    res = yield from loop.run_in_executor(None, requests.get, url)
    # res = requests.get(url)  # calling it directly would block the whole loop
    print(res.text)

loop = asyncio.get_event_loop()
urls = ["https://www.python.org", "https://www.qq.com", "https://www.taobao.com"]
ts = [task(i) for i in urls]
loop.run_until_complete(asyncio.wait(ts))
loop.close()
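If you also want the Response objects back, plus control over how many threads run at once, an explicit executor and gather do the same job. A small sketch under those assumptions:

import asyncio
from concurrent.futures import ThreadPoolExecutor

import requests

async def fetch_all(urls):
    loop = asyncio.get_running_loop()
    with ThreadPoolExecutor(max_workers=10) as pool:
        # Each blocking requests.get runs in its own worker thread
        futures = [loop.run_in_executor(pool, requests.get, u) for u in urls]
        return await asyncio.gather(*futures)

responses = asyncio.run(fetch_all(["https://www.python.org", "https://www.qq.com"]))
print([len(r.text) for r in responses])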



The grequests module
import grequests



urls = ["https://www.python.org","https://www.qq.com","https://www.taobao.com"]

# ts = []
# for i in urls:
#     res = grequests.request("get", i)
#     ts.append(res)
#
# print(res)  # an unsent AsyncRequest; nothing goes out until grequests.map()

ts = [grequests.request("get",i) for i in urls]
res = grequests.map(ts)
print(res)
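grequests.map also takes a size argument to cap concurrency and an exception_handler called for requests that raise; a short sketch (the on_error handler and the failing URL are illustrative):

import grequests

def on_error(req, exc):
    # Called once per failed request; a None is left in the results list
    print("failed:", req.url, exc)

reqs = [grequests.get(u) for u in ["https://www.python.org", "https://nonexistent.invalid"]]
results = grequests.map(reqs, size=10, exception_handler=on_error)  # size caps in-flight requests
print(results)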
 
The tornado module

from tornado.httpclient import AsyncHTTPClient
from tornado.httpclient import HTTPRequest
from tornado import ioloop

count = 0  # number of requests still in flight

def handle_response(response):
    """
    Handle the response. The IO loop runs forever by itself, so a counter
    has to be maintained by hand and ioloop.IOLoop.current().stop() called
    once it hits zero.
    :param response:
    :return:
    """
    if response.error:
        print("Error:", response.error)
    else:
        print(len(response.body))

    global count
    count -= 1  # one callback finished, decrement
    if count == 0:
        ioloop.IOLoop.current().stop()

def func():
    url_list = [
        'http://www.baidu.com',
        'http://www.bing.com',
    ]

    global count
    for url in url_list:
        print(url)
        http_client = AsyncHTTPClient()
        # callback-style fetch (Tornado < 6; see the coroutine sketch below)
        http_client.fetch(HTTPRequest(url), handle_response)
        count += 1  # one request issued, increment

ioloop.IOLoop.current().add_callback(func)
ioloop.IOLoop.current().start()
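Tornado 6 removed the callback argument to fetch in favor of coroutines, which also makes the hand-kept counter unnecessary. A minimal sketch assuming Tornado 6:

from tornado.httpclient import AsyncHTTPClient
from tornado import gen, ioloop

async def crawl():
    client = AsyncHTTPClient()
    urls = ['http://www.baidu.com', 'http://www.bing.com']
    # gen.multi awaits all fetches concurrently
    responses = await gen.multi([client.fetch(u) for u in urls])
    for r in responses:
        print(len(r.body))

# run_sync starts the loop, runs the coroutine, and stops the loop itself
ioloop.IOLoop.current().run_sync(crawl)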




The twisted module
from twisted.web.client import getPage  # deprecated in newer Twisted; see the Agent sketch below
from twisted.internet import defer, reactor

def all_done(arg):
    reactor.stop()

def callback(res):
    print(res)
    return 1

defer_list = []
urls = [
    'http://www.baidu.com',
    'http://www.bing.com',
    'https://www.python.org',
]
for url in urls:
    print(url)
    obj = getPage(url.encode('utf-8'))  # getPage expects a bytes URL
    obj.addCallback(callback)
    defer_list.append(obj)

# defer_list holds the pending Deferreds; addBoth fires all_done once every one has finished
defer.DeferredList(defer_list).addBoth(all_done)
reactor.run()
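getPage is deprecated in newer Twisted releases in favor of Agent; the same crawl with Agent and readBody looks roughly like this (a sketch assuming Twisted 16.7+, not from the original post):

from twisted.internet import defer, reactor
from twisted.web.client import Agent, readBody

agent = Agent(reactor)

def fetch(url):
    # agent.request fires its Deferred with a response object
    d = agent.request(b'GET', url.encode('utf-8'))
    d.addCallback(readBody)  # readBody in turn yields the body bytes
    d.addCallback(lambda body: print(len(body)))
    return d

dl = defer.DeferredList([fetch(u) for u in ['http://www.baidu.com', 'http://www.bing.com']])
dl.addBoth(lambda _: reactor.stop())
reactor.run()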






 
 