爬虫常用的其他模块
gevent模块
"""
gevent 内部就是使用的生成器实现
"""
import gevent
from gevent import monkey;monkey.patch_all()
import requests
def parse_data(resp):
    """Callback invoked with a fetched page body; reports its size."""
    size = len(resp)
    print("parse data:", size)
urls = ["https://www.python.org","https://www.qq.com","https://www.taobao.com"]
def get_page(url, call_back):
    """Fetch *url* synchronously and hand the response text to *call_back*.

    The result is delivered through the callback instead of being returned,
    so this works as a gevent-spawned job.
    """
    print("get : ", url)
    response = requests.get(url)
    call_back(response.text)
# ts = []
# for i in urls:
# g = gevent.spawn(get_page,i,parse_data)
# ts.append(g)
#
# gevent.joinall(ts)
#
# print("over")
# 协程池
from gevent.pool import Pool
# Coroutine pool capped at 100 concurrent greenlets.
pool = Pool(100)
jobs = [
    pool.spawn(get_page, 'https://www.python.org/doc', parse_data),
    pool.spawn(get_page, 'https://www.cnblogs.com/linhaifeng', parse_data),
    pool.spawn(get_page, 'https://www.openstack.org', parse_data),
]
# Block until every spawned fetch has finished.
gevent.joinall(jobs)
# print(g1.value,g2.value,g3.value) #拿到返回值
asyncio实现简单的爬虫
import asyncio
"""
asyncio 是处理IO操作的 所以 我们需要自己处理读和写(http协议)
在建立连接时 如果需要加密链接 则需要依赖pyopenssl模块
"""
user_agent='Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.221 Safari/537.36 SE 2.X MetaSr 1.0'
async def request(method, host, port, ssl=False):
    """Issue a minimal HTTP/1.0 request to *host* and consume the response.

    asyncio only handles the I/O, so the HTTP request line and headers are
    written by hand. Fix: ``@asyncio.coroutine`` / ``yield from`` were
    removed in Python 3.11, so this is now a native ``async def`` coroutine
    (same name, parameters, and behavior).

    :param method: HTTP verb, e.g. "get" (upper-cased before sending)
    :param host: server host name
    :param port: TCP port; ignored and forced to 443 when ssl is true
    :param ssl: use a TLS connection (requires SSL support installed)
    """
    print("get", host)
    if ssl:
        # HTTPS always travels over 443 regardless of the port argument.
        port = 443
    reader, writer = await asyncio.open_connection(host=host, port=port, ssl=ssl)
    # Build and buffer the request headers; write() only queues the bytes.
    request_headers = """%s / HTTP/1.0\r\nHost: %s\r\nUser-agent: %s\r\n\r\n""" % (method.upper(), host, user_agent)
    print(request_headers)
    writer.write(request_headers.encode("utf-8"))
    # drain() actually flushes the buffered bytes onto the socket.
    await writer.drain()
    # Read response headers line by line until the blank separator line.
    while True:
        line = await reader.readline()
        if line == b"\r\n":
            break
        # print(host,"headers: ",line)
    # Everything after the blank line is the response body.
    body = await reader.read()
    # print(body)
    writer.close()  # closing our side lets the peer close too
loop = asyncio.get_event_loop()
urls = ["www.python.org", "www.qq.com", "www.taobao.com"]
# ssl=True makes request() force port 443, so the 80 passed here is ignored.
ts = [request("get", i, 80, True) for i in urls]
# Fix: asyncio.wait() no longer accepts bare coroutines (rejected since
# Python 3.12); gather() accepts them on all versions. The aggregate result
# is unused here, so the change is behavior-compatible.
loop.run_until_complete(asyncio.gather(*ts))
loop.close()
asyncio+requests
import requests,asyncio
"""
"""
async def task(url):
    """Fetch *url* with blocking requests.get, off-loaded to a thread pool.

    requests is synchronous, so it is pushed into the loop's default
    executor to keep the event loop responsive. Fix: ``@asyncio.coroutine``
    / ``yield from`` were removed in Python 3.11, so this is now a native
    ``async def`` coroutine (same name, parameter, and behavior).
    """
    print(url)
    # Inside a running coroutine this returns the running loop.
    loop = asyncio.get_event_loop()
    res = await loop.run_in_executor(None, requests.get, url)
    print(res.text)
loop = asyncio.get_event_loop()
urls = ["https://www.python.org", "https://www.qq.com", "https://www.taobao.com"]
ts = [task(i) for i in urls]
# Fix: asyncio.wait() no longer accepts bare coroutines (rejected since
# Python 3.12); gather() accepts them on all versions. The aggregate result
# is unused, so the change is behavior-compatible.
loop.run_until_complete(asyncio.gather(*ts))
loop.close()
grequests模块
import grequests
urls = [
    "https://www.python.org",
    "https://www.qq.com",
    "https://www.taobao.com",
]
# Build the unsent request objects, then let map() dispatch the whole batch
# concurrently and collect the responses.
ts = [grequests.request("get", target) for target in urls]
res = grequests.map(ts)
print(res)
tornado模块
from tornado.httpclient import AsyncHTTPClient
from tornado.httpclient import HTTPRequest
from tornado import ioloop
count=0
def handle_response(response):
    """Per-request callback: report the result and stop the IO loop once
    every outstanding fetch has completed (via the shared counter, by
    calling ioloop.IOLoop.current().stop()).

    :param response: tornado HTTPResponse delivered by the client
    :return: None
    """
    error = response.error
    if error:
        print("Error:", error)
    else:
        print(len(response.body))
    global count
    count -= 1  # one more fetch has finished
    if count == 0:
        # Nothing left in flight; break out of ioloop.start().
        ioloop.IOLoop.current().stop()
def func():
    """Queue the asynchronous fetches and bump the in-flight counter."""
    targets = [
        'http://www.baidu.com',
        'http://www.bing.com',
    ]
    global count
    for target in targets:
        print(target)
        # AsyncHTTPClient() returns the per-loop client instance.
        client = AsyncHTTPClient()
        client.fetch(HTTPRequest(target), handle_response)
        count += 1  # one more request in flight
current_loop = ioloop.IOLoop.current()
# Schedule the fetches, then block in start() until handle_response()
# stops the loop.
current_loop.add_callback(func)
current_loop.start()
twisted模块
from twisted.web.client import getPage,defer
from twisted.internet import reactor
def all_done(arg):
    """Fired once every deferred has resolved; shuts the reactor down."""
    reactor.stop()
def callback(res):
    """Print the downloaded page body and pass a dummy value down the chain."""
    print(res)
    return 1
defer_list = []
urls = [
    'http://www.baidu.com',
    'http://www.bing.com',
    'https://www.python.org',
]
for url in urls:
    print(url)
    # Fix: the codec name was misspelled as 'utf=-8', which makes
    # str.encode() raise LookupError before any request is sent.
    obj = getPage(url.encode('utf-8'))
    obj.addCallback(callback)
    defer_list.append(obj)
# defer_list holds the pending downloads; addBoth fires all_done when every
# deferred has either succeeded or failed, stopping the reactor.
defer.DeferredList(defer_list).addBoth(all_done)
reactor.run()