Day-24 微信开发 & 高性能相关 & 爬虫scrapy框架
零、回顾
Http协议:
Http协议是什么
一套规则的字符串,GET / HTTP/1.1\r\n......\r\n\r\n
TCP协议,使用socket的sendall("GET / HTTP/1.1\r\n......\r\n\r\na=1")
GET:没有请求体
POST:有请求体
请求体:
GET:GET / HTTP/1.1\r\n......\r\n\r\na=1&b=2
POST:POST / HTTP/1.1\r\n......\r\n\r\n{"k1":123} # POST请求,请求体内容可自定义
ps:请求体格式,通过请求头中的Content-Type标识,服务端接收数据对应不同数据接收处理机制也不同
requests模块
- method
- url
- params
- data
- json
- headers
- cookies
- proxies
BeautifulSoup4模块
解析HTML,XML
Web微信
- 轮询
- 长轮询
一、Web微信
知识点:
- 防盗链
- headers
- cookies
- 检测请求
- url
- Session中:
- qcode
- ctime
- login_cookie_dict
- ticket_dict_cookie
- init_cookie_dict
- 收发消息
实例
- views.py
由于使用到了session,需要python manage.py migrate 生成表
from django.shortcuts import render,HttpResponse,redirect
import requests
import re
import time
import json
#
def ticket(html):
    """Parse the XML credential blob returned after the QR code is confirmed.

    The body looks like <error><skey>..</skey><wxsid>..</wxsid>...</error>;
    every child tag of <error> becomes one key/value pair in the result dict.
    """
    from bs4 import BeautifulSoup
    root = BeautifulSoup(html, 'html.parser').find(name='error')
    return {node.name: node.text for node in root.find_all()}
#
def login(req):
    """Request a login UUID from WeChat and render the QR-code page.

    The template embeds the UUID in the QR image URL
    (https://login.weixin.qq.com/qrcode/<uuid>); scanning that code on a
    phone binds the UUID to the user's account.
    """
    ctime = int(time.time() * 1000)
    qcode_url = (
        "https://login.wx.qq.com/jslogin?appid=wx782c26e4c19acffb"
        "&redirect_uri=https://wx.qq.com/cgi-bin/mmwebwx-bin/webwxnewloginpage"
        "&fun=new&lang=zh_CN&_={0}"
    ).format(ctime)
    r1 = requests.get(qcode_url)
    data = re.findall('uuid = "(.*)";', r1.text)
    uuid = data[0] if data else ""
    req.session['UUID_TIME'] = ctime
    req.session['UUID'] = uuid
    return render(req, 'login.html', locals())
#
def check_login(req):
    """Long-poll WeChat for the QR-scan status.

    Uses Django sessions, so `python manage.py migrate` must have been run.
    WeChat answers with window.code=
      408 - nobody scanned within ~30s (browser retries),
      201 - scanned; the body carries the user's avatar,
      200 - login confirmed on the phone; following redirect_uri yields the
            XML credentials (skey/wxsid/wxuin/pass_ticket) used later on.
    """
    tip = req.GET.get('tip')
    result = {'code': '408', 'data': None}
    ctime = int(time.time() * 1000)
    poll_url = (
        "https://login.wx.qq.com/cgi-bin/mmwebwx-bin/login"
        "?loginicon=true&uuid={0}&tip={1}&r=-950458844&_={2}"
    ).format(req.session['UUID'], tip, ctime)
    r1 = requests.get(poll_url)  # blocks up to ~30 seconds (long poll)
    if 'window.code=408' in r1.text:
        # No scan yet.
        result['code'] = 408
    elif 'window.code=201' in r1.text:
        # Scanned: hand the avatar back to the browser.
        result['code'] = 201
        avatar = re.findall("window.userAvatar = '(.*)';", r1.text)
        result['data'] = avatar[0] if avatar else ""
    elif 'window.code=200' in r1.text:
        # Confirmed on the phone: keep WeChat's cookies locally, then fetch
        # the XML ticket from redirect_uri (the extra query parameters are
        # what the real web client appends before following the redirect).
        req.session['LOGIN_COOKIE'] = r1.cookies.get_dict()
        redirect_base = re.findall('window.redirect_uri="(.*)";', r1.text)[0]
        r2 = requests.get(redirect_base + '&fun=new&version=v2&lang=zh_CN')
        req.session['TICKED_DICT'] = ticket(r2.text)
        req.session['TICKED_COOKIE'] = r2.cookies.get_dict()
        result['code'] = 200
    return HttpResponse(json.dumps(result))
#
def index(req):
    """Initialise the Web-WeChat session and show recent contacts.

    POSTs the BaseRequest credentials to webwxinit; the response carries
    the logged-in user's profile, the recent contact list and the SyncKey
    that message polling (getMsg) needs later.
    """
    if not req.session.get('TICKED_DICT'):
        return redirect('/login.html/')
    ticket_dict = req.session['TICKED_DICT']
    init_url = ("https://wx.qq.com/cgi-bin/mmwebwx-bin/webwxinit"
                "?r=-1930201634&lang=zh_CN&pass_ticket={0}"
                ).format(ticket_dict['pass_ticket'])
    post_data = {
        "BaseRequest": {
            'DeviceID': 'e385574362482605',
            'Sid': ticket_dict['wxsid'],  # field names may vary by version
            'Skey': ticket_dict['skey'],
            'Uin': ticket_dict['wxuin'],
        }
    }
    # Fix: the header value was 'application/json:charset=utf-8' — the
    # media-type parameter separator is ';', not ':'.
    # (requests.post(init_url, json=post_data) would set this correctly too.)
    response = requests.post(
        init_url,
        data=json.dumps(post_data),
        headers={'Content-Type': 'application/json; charset=utf-8'},
    )
    response.encoding = 'utf-8'
    init_dict = json.loads(response.text)
    req.session['init_cookie_dict'] = response.cookies.get_dict()
    # SyncKey is popped out: it is session state, not template data.
    sync_key = init_dict.pop('SyncKey')
    req.session['sync_key'] = sync_key
    req.session['init_dict'] = init_dict
    return render(req, 'index.html', locals())
#
def avatar(req):
    """Proxy a WeChat avatar image to the browser.

    WeChat's anti-leech check requires the wx.qq.com cookies and a Referer
    of https://wx.qq.com, so the browser (Referer 127.0.0.1:8000) cannot
    load the image directly. The '&' characters in the original image URL
    would truncate a single query value, hence prev/username/skey arrive
    as separate GET parameters and are reassembled here.
    """
    base_url = 'https://wx.qq.com'
    prev = req.GET.get('prev')
    username = req.GET.get('username')
    skey = req.GET.get('skey')
    img_url = '{0}{1}&username={2}&skey={3}'.format(base_url, prev, username, skey)
    cookies = {}
    cookies.update(req.session['LOGIN_COOKIE'])
    cookies.update(req.session['TICKED_COOKIE'])
    cookies.update(req.session['init_cookie_dict'])
    # Fix: send the Referer *request* header the anti-leech check looks for;
    # the original sent Content-Type, which is a response header and has no
    # effect on a GET request.
    res = requests.get(img_url, cookies=cookies,
                       headers={'Referer': 'https://wx.qq.com/'})
    # Debug dump of the last avatar. Fix: 'wb' overwrites; the original 'ab'
    # appended every image, so a.jpg grew forever and was never valid.
    with open('a.jpg', 'wb') as f:
        f.write(res.content)  # .content is bytes, .text is str
    return HttpResponse(res.content)  # sockets transmit bytes anyway
#
def contact_list(request):
    """Fetch the complete contact list from webwxgetcontact and render it."""
    ctime = int(time.time() * 1000)
    url = ("https://wx.qq.com/cgi-bin/mmwebwx-bin/webwxgetcontact"
           "?lang=zh_CN&r={0}&seq=0&skey={1}"
           ).format(ctime, request.session['TICKED_DICT']['skey'])
    # All three cookie jars collected during login are required.
    cookies = {}
    for session_key in ('LOGIN_COOKIE', 'TICKED_COOKIE', 'init_cookie_dict'):
        cookies.update(request.session[session_key])
    r1 = requests.get(url, cookies=cookies)
    r1.encoding = 'utf-8'
    user_list = json.loads(r1.text)
    return render(request, 'contact_list.html', {'user_list': user_list})
#
def send_msg(req):
    """Send a text message (Type=1) to `to_user` via webwxsendmsg."""
    ticket_dict = req.session['TICKED_DICT']
    stamp = time.time() * 1000  # doubles as client-side message id
    to_user = req.GET.get('to_user')
    msg = req.GET.get('msg')
    send_msg_url = ('https://wx.qq.com/cgi-bin/mmwebwx-bin/webwxsendmsg'
                    '?pass_ticket={0}').format(ticket_dict['pass_ticket'])
    post_data = {
        "BaseRequest": {
            'DeviceID': 'e385574362482605',
            'Sid': ticket_dict['wxsid'],
            'Skey': ticket_dict['skey'],
            'Uin': ticket_dict['wxuin'],
        },
        "Msg": {
            'ClientMsgId': stamp,
            'LocalID': stamp,
            'Content': msg,
            'FromUserName': req.session['init_dict']['User']['UserName'],
            'ToUserName': to_user,
            'Type': 1,
        },
        "Scene": 0,
    }
    cookies = {}
    for session_key in ('LOGIN_COOKIE', 'TICKED_COOKIE', 'init_cookie_dict'):
        cookies.update(req.session[session_key])
    # ensure_ascii=False keeps Chinese characters literal; encode to utf-8
    # ourselves instead of letting requests escape them (json= would escape).
    requests.post(
        url=send_msg_url,
        data=json.dumps(post_data, ensure_ascii=False).encode('utf-8'),
        headers={'Content-Type': 'application/json'},
        cookies=cookies,
    )
    return HttpResponse('OK!')
#
def getMsg(req):
    """Poll WeChat for new messages (driven in a loop by the front end).

    synccheck reports whether anything changed; webwxsync fetches the new
    messages plus a fresh SyncKey, which replaces the one in the session.
    """
    time.sleep(4.38)  # throttle the browser-driven polling loop
    ctime = time.time() * 1000
    ticket_dict = req.session['TICKED_DICT']
    sync_key = req.session['sync_key']
    # synckey query format: "key1_val1|key2_val2|..."
    resync_key = '|'.join(
        "%s_%s" % (entry['Key'], entry['Val']) for entry in sync_key['List'])
    get_msg_url = ('https://wx.qq.com/cgi-bin/mmwebwx-bin/webwxsync'
                   '?sid={0}&skey={1}&lang=zh_CN'
                   ).format(ticket_dict['wxsid'], ticket_dict['skey'])
    params_dict = {
        'r': ctime,
        'skey': ticket_dict['skey'],
        'sid': ticket_dict['wxsid'],
        'uin': ticket_dict['wxuin'],
        'deviceid': 'e385574362482605',
        'synckey': resync_key,
        '_': ctime,
    }
    cookies = {}
    for session_key in ('LOGIN_COOKIE', 'TICKED_COOKIE', 'init_cookie_dict'):
        cookies.update(req.session[session_key])
    # Step 1: ask whether any friend sent something.
    response = requests.get(
        'https://webpush.wx.qq.com/cgi-bin/mmwebwx-bin/synccheck',
        params=params_dict, cookies=cookies)
    # Step 2: pull the actual messages (and the next SyncKey).
    post_data = {
        "BaseRequest": {
            'DeviceID': 'e385574362482605',
            'Sid': ticket_dict['wxsid'],  # field names may vary by version
            'Skey': ticket_dict['skey'],
            'Uin': ticket_dict['wxuin'],
        },
        'SyncKey': sync_key,
        'rr': '-1928629916',
    }
    msg_response = requests.post(get_msg_url, json=post_data, cookies=cookies)
    msg_response.encoding = 'utf-8'
    msg_dict = json.loads(msg_response.text)
    print(response.text)
    if 'window.synccheck={retcode:"0",selector:"2"}' in response.text:
        # selector "2": new chat messages arrived.
        for row in msg_dict['AddMsgList']:
            print(row.get('Content'))
    else:
        print('-' * 200)
    req.session['sync_key'] = msg_dict['SyncKey']
    return HttpResponse('...')
#
def api(request):
    """Skeleton endpoint for using WeChat as a notification API.

    Idea: server alerts are pushed as WeChat messages; a reply could even
    trigger an action such as stopping a service. Actual sending is not
    implemented yet.

    Fix: the original had no return statement, so Django raised
    "The view ... didn't return an HttpResponse object".
    """
    name = request.GET.get('name')
    msg = request.GET.get('msg')
    # TODO: look up `name` in the contact list and forward `msg` through the
    # same webwxsendmsg flow used by send_msg().
    return HttpResponse('OK')
- login.html
<body>
<img id="img" style="width: 200px;height: 200px" src="https://login.weixin.qq.com/qrcode/{{ uuid }}">
<script src="/static/jquery-3.2.1.js"></script>
<script>
TIP = 1; {# 进入页面时TIP为1,之后每次长连接返回TIP为0 #}
$(function () {
checkLogin();
});
function checkLogin() {
//向后台发送请求,浏览器有同源策略,跨域的问题,所以需要发送给后台,后台在向微信服务器发送请求
$.ajax({
url:'/check_login.html/',
type:'GET',
data:{'tip':TIP},
dataType:'JSON',
success:function (arg) {
if (arg.code == 408){
//如果返回408,表示没有人扫码继续发送长连接
if (arg.tip == 1){
TIP = 0;
}
checkLogin();
}else if(arg.code == 201){
//如果返回201,表示手机已经扫码返回头像
$('#img').attr('src',arg.data);
if (arg.tip == 0){
TIP = 1;
}else {
TIP = 0;
}
checkLogin();
}else if(arg.code == 200) {
location.href = '/index.html/'
}
}
})
}
</script>
- index.html
<body>
<h1>个人信息</h1>
<img src="/avatar.html?prev={{ init_dict.User.HeadImgUrl }}/">
<h2>{{ init_dict.User.NickName }}</h2>
<h1>最近联系人</h1>
<ul>
{% for user in init_dict.ContactList %}
{# 看不到头像设置了防倒链,浏览器发送的Referer为127.0.0.1:8000,需求获取refer请求头(wx.qq.com #}
{# 通过自己的后台方法访问获取头像 #}
<li><img style="width: 50px;height: 50px;" src="/avatar.html?prev={{ user.HeadImgUrl }}/"> {{ user.NickName }} / 唯一标识:【{{ user.UserName }}】</li>
{% endfor %}
</ul>
<a href="/contact_list.html/">更多联系人</a>
<h1>公众号信息</h1>
</body>
- contact_list.html
<body>
<div>
<h3>发送信息</h3>
<p>接收者:<input type="text" id="to_user"></p>
<p>消息内容:<input type="text" id="msg"></p>
<input type="button" value="发送" id="btn">
</div>
<div>用户列表</div>
<h3>个人信息</h3>
<ul>
<li>{{ request.session.init_dict.User.NickName }}</li>
<li>{{ request.session.init_dict.User.UserName }}</li>
</ul>
<div>{{ user_list.MemberCount }}</div>
{% for user in user_list.MemberList %}
<div username="{{ user.UserName }}">
{# <img style='width: 50px;height: 50px' src="/avatar.html?prev={{ user.HeadImgUrl }}/">#}
<span>{{ user.NickName }}---->{{ user.UserName }}</span>
</div>
{% endfor %}
<script src="/static/jquery-3.1.1.js"></script>
<script>
$(function () {
bindBtnEvent();
getMsg();
});
function bindBtnEvent() {
$('#btn').click(function () {
$.ajax({
url:'/send_msg.html',
type:'GET',
data:{'to_user':$('#to_user').val(),'msg':$('#msg').val()},
success:function (arg) {
console.log(arg);
}
})
});
}
function getMsg() {
$.ajax({
url:'/getMsg.html/',
type:'GET',
success:function (arg) {
getMsg();
},
error:function () {
getMsg();
}
})
}
</script>
</body>
- urls.py
from django.conf.urls import url
from django.contrib import admin
from app01 import views

# Every pattern is anchored with ^...$ — three entries (index, avatar,
# contact_list) were missing the leading '^', so e.g. 'xindex.html/' would
# also have matched.
urlpatterns = [
    url(r'^admin/', admin.site.urls),
    url(r'^login.html/$', views.login),
    url(r'^check_login.html/$', views.check_login),
    url(r'^index.html/$', views.index),
    url(r'^avatar.html/$', views.avatar),
    url(r'^contact_list.html/$', views.contact_list),
    url(r'^send_msg.html/$', views.send_msg),
    url(r'^getMsg.html/$', views.getMsg),
    url(r'^api.html/$', views.api),
]
二、高性能异步IO
传统并发。
使用线程池或者进程池,但存在资源浪费的情况。
python2中没有线程池概念。
python3中既有线程池也有进程池。
from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor
import requests

# python2: process pool only; python3: both thread and process pools.

def task(url):
    """Download one page and print its body (runs inside a pool worker)."""
    response = requests.get(url)
    print(response.content)

url_list = ['http://www.baidu.com', 'http://www.bing.com']
# The with-block is the idiomatic form of the manual pool.shutdown(wait=True):
# it waits for every submitted task before execution continues.
with ThreadPoolExecutor(10) as pool:      # thread pool
    # pool = ProcessPoolExecutor(4)       # process-pool variant
    for url in url_list:
        pool.submit(task, url)            # future result unused here
异步非阻塞
非阻塞:不等待
异步:回调
1个线程完成并发操作
- asyncio模块
# asyncio + requests: requests is blocking, so each call is pushed onto the
# event loop's default thread-pool executor and awaited through a Future.
import asyncio
import requests
# NOTE(review): @asyncio.coroutine / `yield from` is the pre-3.5 generator
# coroutine style (deprecated in 3.8, removed in 3.11); modern code uses
# `async def` / `await`.
@asyncio.coroutine
def fetch_async(func, *args):
loop = asyncio.get_event_loop()
# run_in_executor(None, ...) -> run the blocking call in the default
# ThreadPoolExecutor, returning a Future the coroutine can wait on.
future = loop.run_in_executor(None, func, *args)
response = yield from future
print(response.url, response.content)
tasks = [
fetch_async(requests.get, 'http://www.cnblogs.com/wupeiqi/'),
fetch_async(requests.get, 'http://dig.chouti.com/pic/show?nid=4073644713430508&lid=10273091')
]
loop = asyncio.get_event_loop()
# gather() schedules all coroutines; run_until_complete blocks until done.
results = loop.run_until_complete(asyncio.gather(*tasks))
loop.close()
- gevent模块
import gevent
import requests
from gevent import monkey
# patch_all() replaces the stdlib's blocking socket calls with cooperative
# ones, so plain `requests` yields to other greenlets during network waits.
monkey.patch_all()
def fetch_async(method, url, req_kwargs):
print(method, url, req_kwargs)
response = requests.request(method=method, url=url, **req_kwargs)
print(response.url, response.content)
# ##### send the requests (joinall waits for every greenlet) #####
gevent.joinall([
gevent.spawn(fetch_async, method='get', url='https://www.python.org/', req_kwargs={}),
gevent.spawn(fetch_async, method='get', url='https://www.yahoo.com/', req_kwargs={}),
gevent.spawn(fetch_async, method='get', url='https://github.com/', req_kwargs={}),
])
# ##### variant: a Pool caps the number of concurrent greenlets #####
# from gevent.pool import Pool
# pool = Pool(None)
# gevent.joinall([
# pool.spawn(fetch_async, method='get', url='https://www.python.org/', req_kwargs={}),
# pool.spawn(fetch_async, method='get', url='https://www.yahoo.com/', req_kwargs={}),
# pool.spawn(fetch_async, method='get', url='https://www.github.com/', req_kwargs={}),
# ])
- Twisted模块
from twisted.web.client import getPage, defer
from twisted.internet import reactor
# Stops the event loop once every Deferred in the list has fired.
def all_done(arg):
reactor.stop()
# Per-page callback: receives the downloaded body.
def callback(contents):
print(contents)
deferred_list = []
url_list = ['http://www.bing.com', 'http://www.baidu.com', ]
for url in url_list:
# getPage starts the download immediately and returns a Deferred.
deferred = getPage(bytes(url, encoding='utf8'))
deferred.addCallback(callback)
deferred_list.append(deferred)
# DeferredList fires when all downloads finish; addBoth runs on success
# or failure alike, shutting the reactor down.
dlist = defer.DeferredList(deferred_list)
dlist.addBoth(all_done)
reactor.run()
- 抓取网站链接,异步IO获取页面html
# Crawl hao123's front page synchronously, then fetch every absolute link
# concurrently via asyncio + thread-pool executor, dumping the HTML to a file.
from bs4 import BeautifulSoup
import requests
import asyncio
import time
import json
html_list = []
@asyncio.coroutine
def fetch_async(func, *args):
loop = asyncio.get_event_loop()
# Run blocking requests.get in the default thread pool.
future = loop.run_in_executor(None, func, *args)
response = yield from future
content = response.content
# 'ignore' drops undecodable bytes instead of raising.
content = content.decode('utf-8','ignore')
html_list.append({response.url:content,})
url = 'http://www.hao123.com'
html = requests.get(url)
html = html.content
html = html.decode('utf-8')
soup = BeautifulSoup(html,'html.parser')
a = soup.find_all(name='a')
tasks = []
print(time.ctime())
for href in a:
# str() guards against tags without an href (get() returns None).
href = str(href.get('href'))
if href[:4] == 'http':
task = fetch_async(requests.get,href)
tasks.append(task)
loop = asyncio.get_event_loop()
results = loop.run_until_complete(asyncio.gather(*tasks))
loop.close()
f = open('abc.txt','a')
for i in html_list:
f.write(json.dumps(i)+"\n")
f.close()
print(time.ctime())
高性能先自定义异步IO模块示例
IO多路复用:
select,用于检测socket对象是否发生变化(是否连接成功,是否有数据到来),变化后扫描函数中是否有相应的操作
它通过一个select()系统调用来监视多个文件描述符的数组,当select()返回后,该数组中就绪的文件描述符便会被内核修改标志位,使得进程可以获得这些文件描述符从而进行后续的读写操作。
Socket:
在socket客户端
使用一个线程完成并发操作,
当第一个任务到来时,
先发送连接请求,
此时发生IO等待,
但是不等待,
继续发送第二个任务的连接请求...
IO多路复用监听socket变化
先连接成功:
发送请求信息: GET / http/1.0\r\nhost...
遇到IO等待,不等待,继续监测是否有人连接成功:
有结果返回:
处理结果,读取返回的内容,执行回调函数
# s1.py
import socket
import select


class Request(object):
    """Pair a non-blocking socket with its callback and target host.

    select() only requires objects exposing fileno(), so instances can be
    handed to select.select() in place of the raw sockets.
    """

    def __init__(self, sock, func, url):
        self.sock = sock   # the non-blocking client socket
        self.func = func   # callback invoked with the raw response bytes
        self.url = url     # hostname, reused for the Host header

    def fileno(self):
        # Delegate to the wrapped socket so select() can watch us.
        return self.sock.fileno()
def async_request(url_list):
    """Fetch '/' from every (host, callback) pair concurrently in ONE thread
    using select()-based IO multiplexing.

    url_list items are [host, callback]; each callback receives the first
    chunk (up to 8096 bytes) of the raw HTTP response.
    """
    input_list = []   # requests still waiting for response data
    conn_list = []    # requests still waiting for the connection to complete
    for url in url_list:
        client = socket.socket()
        client.setblocking(False)      # non-blocking: connect() raises at once
        try:
            client.connect((url[0], 80))   # would normally block
        except BlockingIOError:
            pass  # expected: the handshake continues in the background
        obj = Request(client, url[1], url[0])
        input_list.append(obj)
        conn_list.append(obj)
    while True:
        # wlist: sockets whose connection completed (writable)
        # rlist: sockets with response data ready (readable)
        rlist, wlist, elist = select.select(input_list, conn_list, [], 0.05)
        for request_obj in wlist:
            # Connected: send the HTTP request once, then stop watching this
            # socket for writability.
            request_obj.sock.sendall(
                "GET / HTTP/1.1\r\nhost:{0}\r\n\r\n".format(
                    request_obj.url).encode('utf-8'))
            conn_list.remove(request_obj)
        for request_obj in rlist:
            # Response arrived: hand the bytes to the callback and clean up.
            data = request_obj.sock.recv(8096)
            request_obj.func(data)
            request_obj.sock.close()
            input_list.remove(request_obj)
        if not input_list:
            # All done. Fix: the original executed a stray client.close()
            # here, double-closing whichever socket the loop variable from
            # the setup phase still referenced.
            break
# s2.py
# Driver script: registers one callback per host and runs the select-based
# downloader from s1.py.
import s1
def callback1(data):
print(data)
def callback2(data):
print(data)
# NOTE(review): input_list/conn_list are unused leftovers here — the real
# ones live inside s1.async_request.
input_list = []
conn_list = []
# Each entry: [hostname, callback invoked with the raw response bytes].
url_list = [
['www.baidu.com',callback1],
['www.cnblog.com',callback2],
]
s1.async_request(url_list)
协程
协程定义:
单纯地执行一段代码后切换,跳到另外一段代码执行,再继续跳...。
异步IO:
[基于协程]可以用协程+非阻塞socket(gevent)
或
[基于事件循环]完全通过socket+select(IO多路复用)实现(Twisted,tornado)
-
如何提高爬虫并发
利用异步IO模块,如:asyncio,twisted,gevent
本质:基于协程,或基于事件循环 -
异步非阻塞
异步:回调 select
非阻塞:不等待 setblocking(False) -
什么是协程?
from greenlet import greenlet

# Fix: the method is greenlet.switch(); the original's gr1.swich()/gr2.swich()
# raised AttributeError at runtime.

def test1():
    print(12)
    gr2.switch()   # jump into test2
    print(34)
    gr2.switch()   # back to test2 for its second half

def test2():
    print(56)
    gr1.switch()   # resume test1 after its first print
    print(78)

gr1 = greenlet(test1)
gr2 = greenlet(test2)
gr1.switch()       # start: prints 12, 56, 34, 78
三、scrapy框架
脚本类的爬虫可以使用request+bs4+twisted或gevent或asyncio实现
scrapy框架功能
下载页面--->使用twisted
html解析--->使用scrapy内容方法
限速--->伪造人为方式爬取信息
去重--->爬过的url不去爬
递归--->递归爬取连接,并可限制最多爬几层页面
代理--->支持代理
https--->支持https
中间件--->支持中间件
scrapy框架结构
1.scrapy组件
- 引擎(Scrapy)
用来处理整个系统的数据流处理, 触发事务(框架核心) - 调度器(Scheduler)
用来接受引擎发过来的请求, 压入队列中, 并在引擎再次请求的时候返回. 可以想像成一个URL(抓取网页的网址或者说是链接)的优先队列, 由它来决定下一个要抓取的网址是什么, 同时去除重复的网址 - 下载器(Downloader)
用于下载网页内容, 并将网页内容返回给蜘蛛(Scrapy下载器是建立在twisted这个高效的异步模型上的) - 爬虫(Spiders)
爬虫是主要干活的, 用于从特定的网页中提取自己需要的信息, 即所谓的实体(Item)。用户也可以从中提取出链接,让Scrapy继续抓取下一个页面
项目管道(Pipeline)
负责处理爬虫从网页中抽取的实体,主要的功能是持久化实体、验证实体的有效性、清除不需要的信息。当页面被爬虫解析后,将被发送到项目管道,并经过几个特定的次序处理数据。 - 下载器中间件(Downloader Middlewares)
位于Scrapy引擎和下载器之间的框架,主要是处理Scrapy引擎与下载器之间的请求及响应。 - 爬虫中间件(Spider Middlewares)
介于Scrapy引擎和爬虫之间的框架,主要工作是处理蜘蛛的响应输入和请求输出。 - 调度中间件(Scheduler Middewares)
介于Scrapy引擎和调度之间的中间件,从Scrapy引擎发送到调度的请求和响应。
- Scrapy运行流程
引擎从调度器中取出一个链接(URL)用于接下来的抓取
引擎把URL封装成一个请求(Request)传给下载器
下载器把资源下载下来,并封装成应答包(Response)
爬虫解析Response
解析出实体(Item),则交给实体管道进行进一步的处理
解析出的是链接(URL),则把URL交给调度器等待抓取
Spiders(爬虫):
看做是一个类,其中有url,和自定义的回调函数
会有一个,更多的是多个Spiders依次执行
如果返回数据需要继续操作(如访问下一级页面获取数据),则重复开头动作
如果返回得到需求结果,则交给item Pipeline做数据持久化
Scarpy Engine将URL请求放在Scheduler任务调度器中
Scheduler(调度器):
Scheduler中包含任务队列,每个任务为一个对象其中包括请求URL和回调函数名
Downloader(下载器):
Downloader去Scheduler里取任务下载,利用twisted去远程发送请求,返回结果
作为参数,传给Spiders执行回调函数。
item Pipeline(数据持久化):
回调函数执行成功的结果,传递给item Pipeline做数据持久化
item Pipeline里定义数据持久化的操作
安装scrapy
Linux
pip3 install scrapy
Windows
a. pip3 install wheel
b. 下载twisted http://www.lfd.uci.edu/~gohlke/pythonlibs/#twisted
c. 进入下载目录,执行 pip3 install Twisted‑17.1.0‑cp35‑cp35m‑win_amd64.whl
d. pip3 install scrapy
e. 下载并安装pywin32:https://sourceforge.net/projects/pywin32/files/
基本操作
-
scrapy startproject 项目名称
- 在当前目录中创建中创建一个项目文件(类似于Django)
-
scrapy genspider [-t template]
- 创建爬虫应用
如:
scrapy genspider -t basic oldboy oldboy.com
scrapy genspider -t xmlfeed autohome autohome.com.cn
PS:
查看所有模板:scrapy genspider -l
查看模板内容:scrapy genspider -d 模板名称
- 创建爬虫应用
-
scrapy list
- 展示爬虫应用列表
-
scrapy crawl 爬虫应用名称
- 运行单独爬虫应用
scrapy项目结构
project_name/
scrapy.cfg
project_name/
__init__.py
items.py # 和pipline共同做持久化,指定规则
middlewares.py # 中间件
pipelines.py # 持久化
settings.py # 配置文件
spiders/ # 具体爬虫内容
__init__.py
爬虫1.py
爬虫2.py
爬虫3.py
文件说明:
scrapy.cfg 项目的主配置信息。(真正爬虫相关的配置信息在settings.py文件中)
items.py 设置数据存储模板,用于结构化数据,如:Django的Model
pipelines 数据处理行为,如:一般结构化的数据持久化
settings.py 配置文件,如:递归的层数、并发数,延迟下载等
spiders 爬虫目录,如:创建文件,编写爬虫规则
爬虫文件
# chouti.py — minimal spider: fetch the start URL and print the body.
import scrapy
import io,os,sys
# Re-wrap stdout so console printing works; gb18030 covers most East-Asian text.
sys.stdout=io.TextIOWrapper(sys.stdout.buffer,encoding='gb18030')
class ChoutiSpider(scrapy.Spider):
name = 'chouti'
allowed_domains = ['chouti.com']
start_urls = ['http://chouti.com/']
#
def parse(self, response):
# Default callback for every start URL; .text is str, .body is bytes.
print(response.text)
# print(response.body)
# cmd : scrapy crawl chouti --nolog
选择器
#!/usr/bin/env python
# -*- coding:utf-8 -*-
# XPath selector cheat sheet: each commented pair shows one query pattern
# against the small HTML document below.
from scrapy.selector import Selector, HtmlXPathSelector
from scrapy.http import HtmlResponse
html = """<!DOCTYPE html>
<html>
<head lang="en">
<meta charset="UTF-8">
<title></title>
</head>
<body>
<ul>
<li class="item-"><a id='i1' href="link.html">first item</a></li>
<li class="item-0"><a id='i2' href="llink.html">first item</a></li>
<li class="item-1"><a href="llink2.html">second item<span>vv</span></a></li>
</ul>
<div><a href="llink2.html">second item</a></div>
</body>
</html>
"""
response = HtmlResponse(url='http://example.com', body=html,encoding='utf-8')
# hxs = HtmlXPathSelector(response)
# print(hxs)
# -- basic selection: all <a>, positional, by attribute presence/value --
# hxs = Selector(response=response).xpath('//a')
# print(hxs)
# hxs = Selector(response=response).xpath('//a[2]')
# print(hxs)
# hxs = Selector(response=response).xpath('//a[@id]')
# print(hxs)
# hxs = Selector(response=response).xpath('//a[@id="i1"]')
# print(hxs)
# hxs = Selector(response=response).xpath('//a[@href="link.html"][@id="i1"]')
# print(hxs)
# -- substring matching: contains / starts-with / regex (re:test) --
# hxs = Selector(response=response).xpath('//a[contains(@href, "link")]')
# print(hxs)
# hxs = Selector(response=response).xpath('//a[starts-with(@href, "link")]')
# print(hxs)
# hxs = Selector(response=response).xpath('//a[re:test(@id, "i\d+")]')
# print(hxs)
# -- extracting values: text()/@attr + extract()/extract_first() --
# hxs = Selector(response=response).xpath('//a[re:test(@id, "i\d+")]/text()').extract()
# print(hxs)
# hxs = Selector(response=response).xpath('//a[re:test(@id, "i\d+")]/@href').extract()
# print(hxs)
# hxs = Selector(response=response).xpath('/html/body/ul/li/a/@href').extract()
# print(hxs)
# hxs = Selector(response=response).xpath('//body/ul/li/a/@href').extract_first()
# print(hxs)
#
# -- nested (relative) selection from an already-selected node --
# ul_list = Selector(response=response).xpath('//body/ul/li')
# for item in ul_list:
# v = item.xpath('./a/span')
# # or
# # v = item.xpath('a/span')
# # or
# # v = item.xpath('*/a/span')
# print(v)
pipeline(持续存储)
# /sp1/sp1/spiders/chouti.py — spider that yields Items into the pipeline.
import scrapy
import io,os,sys
from scrapy import Request
from scrapy.selector import HtmlXPathSelector # plays the role bs4 did earlier
sys.stdout=io.TextIOWrapper(sys.stdout.buffer,encoding='gb18030')
from ..items import Sp1Item
class ChoutiSpider(scrapy.Spider):
name = 'chouti' # `scrapy crawl <name>` selects the spider; unknown name errors out
allowed_domains = ['chouti.com'] # URLs outside this domain are not crawled
start_urls = ['http://dig.chouti.com/'] # any number of start URLs; parse() is the default callback
# def start_requests(self): # override (instead of start_urls) to customise headers, cookies, etc.
# yield Request(url='http://dig.chouti.com/',headers={},callback=self.parse)
def parse(self, response):
# print(response.text)
hxs = HtmlXPathSelector(response) # selector object the XPath queries run against
item_list = hxs.select('//div[@id="content-list"]/div[@class="item"]')
# print(item_list)
for item in item_list:
# item.select('./div[@class="new-content"]/div[@class="part2"]/text()')
# title = item.select('./div[@class="news-content"]/div[@class="part2"]/@share-title').extract()
url = item.select('./div[@class="news-content"]/div[@class="part2"]/@share-pic').extract_first()
title = item.select('./div[@class="news-content"]/div[@class="part2"]/@share-title').extract_first()
# print(v)
obj = Sp1Item(title=title,url=url)
yield obj # yielding an Item object hands it to the pipeline automatically
#
# /sp1/sp1/items.py — structured-data template (comparable to a Django Model).
import scrapy
class Sp1Item(scrapy.Item):
# define the fields for your item here like:
# name = scrapy.Field()
title = scrapy.Field() # declared Item field
url = scrapy.Field() # declared Item field
#
# /sp1/sp1/pipelines.py — persists every yielded Item to a file.
class Sp1Pipeline(object):
def __init__(self,file_path):
self.file_path = file_path
self.file_obj = None
@classmethod
def from_crawler(cls,crawler):
"""
Called once at startup to build the pipeline instance.
:param crawler: gives access to the project settings
:return: the pipeline object
"""
val = crawler.settings.get('XXX') # read the target path from settings.py
return cls(val)
def process_item(self, item, spider):
# NOTE(review): item['url'] may be None when @share-pic was absent —
# str concatenation would then raise TypeError; confirm upstream data.
self.file_obj.write(item['url']+'\r\n')
print('pipline-->',item)
return item
def open_spider(self,spider):
"""
Runs exactly once, when the spider starts.
:param spider:
:return:
"""
self.file_obj = open(self.file_path,mode='a+') # open the output file for the crawl
def close_spider(self,spider):
"""
Runs exactly once, when the spider finishes.
:param spider:
:return:
"""
self.file_obj.close() # release the file handle
#
# settings.py
# 解开注释才可以使用pipeline,300为优先级,可以有多个pipeline
ITEM_PIPELINES = {
'sp1.pipelines.Sp1Pipeline': 300,
}
XXX = "data.json" # 自定义
递归执行爬虫
# /sp1/sp1/spiders/chouti.py — same spider, now following pagination links.
import scrapy
import io, os, sys
from scrapy import Request  # fix: Request is used below but was never imported
from scrapy.selector import HtmlXPathSelector
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='gb18030')
from ..items import Sp1Item


class ChoutiSpider(scrapy.Spider):
    name = 'chouti'
    allowed_domains = ['chouti.com']
    start_urls = ['http://dig.chouti.com/']

    def parse(self, response):
        """Yield one Sp1Item per story, then recurse into pagination links."""
        hxs = HtmlXPathSelector(response)
        item_list = hxs.select('//div[@id="content-list"]/div[@class="item"]')
        for item in item_list:
            url = item.select('./div[@class="news-content"]/div[@class="part2"]/@share-pic').extract_first()
            title = item.select('./div[@class="news-content"]/div[@class="part2"]/@share-title').extract_first()
            # Yielding an Item hands it to the pipeline for persistence.
            yield Sp1Item(title=title, url=url)
        # Find every pager link and schedule it with parse() as the callback;
        # yielding a Request puts it on the Scheduler queue (headers= /
        # cookies= can be added here when needed).
        page_url_list = hxs.select('//div[@id="dig_lcpage"]//a[re:test(@href,"/all/hot/recent/\d+")]/@href').extract()
        for url in page_url_list:
            yield Request(url='http://dig.chouti.com' + url, callback=self.parse)
#
# settings.py
DEPTH_LIMIT = 1 # 限制递归爬虫的层数,多url的限制数量哪怕是10,结果数量也会非常庞大
ROBOTSTXT_OBEY = True # 是否遵循爬虫协议,不遵循改为False