Day-24: WeChat Web Development, High-Performance Async IO & the Scrapy Crawler Framework

0. Review

The HTTP protocol:

What is the HTTP protocol?
A plain string that follows an agreed set of rules: GET / HTTP/1.1\r\n......\r\n\r\n
It rides on top of TCP; with a raw socket you would send it via sendall("GET / HTTP/1.1\r\n......\r\n\r\na=1")
GET: no request body
POST: has a request body


Request body:

GET:  GET /?a=1&b=2 HTTP/1.1\r\n......\r\n\r\n      (GET parameters travel in the URL query string)
POST: POST / HTTP/1.1\r\n......\r\n\r\n{"k1":123}   # for POST, the body content is whatever the client chooses to send
Note: the body format is declared by the Content-Type request header, and the server picks a different parsing mechanism for each format (see the requests sketch below)

The requests module (a minimal usage sketch follows the parameter list)

- method
- url
- params
- data
- json
- headers
- cookies
- proxies
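
A minimal sketch of how these parameters map onto a single call (the URL, cookie and proxy values below are placeholders, not from the notes):

import requests
response = requests.request(
    method='POST',
    url='http://example.com/api',               # placeholder URL
    params={'page': 1},                         # appended to the URL as ?page=1
    data={'k1': 'v1'},                          # form-encoded body -> Content-Type: application/x-www-form-urlencoded
    # json={'k1': 'v1'},                        # alternative: JSON body -> Content-Type: application/json
    headers={'User-Agent': 'Mozilla/5.0'},
    cookies={'session': 'placeholder'},
    # proxies={'http': 'http://127.0.0.1:8888'},  # optional proxy (placeholder address)
)
print(response.status_code, response.text[:100])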

The BeautifulSoup4 module

Parses HTML and XML (a minimal sketch below)
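
A minimal BeautifulSoup4 sketch (the HTML snippet is made up for illustration):

from bs4 import BeautifulSoup
html = '<div id="content"><a href="/a.html">first</a><a href="/b.html">second</a></div>'
soup = BeautifulSoup(html, 'html.parser')
for tag in soup.find(name='div', attrs={'id': 'content'}).find_all('a'):
    print(tag.name, tag.get('href'), tag.text)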

Web WeChat

  • Polling
  • Long polling (the sketch below contrasts the two)
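
Roughly, the difference from the client's side (a sketch against a hypothetical /check endpoint, not the real WeChat API):

import time
import requests

CHECK_URL = 'http://example.com/check'    # hypothetical endpoint

def polling():
    # plain polling: ask on a fixed interval; most responses carry nothing new
    while True:
        r = requests.get(CHECK_URL)
        print(r.status_code)
        time.sleep(5)

def long_polling():
    # long polling: the server holds the request open until it has data (or its
    # own timeout fires); the client re-issues the request immediately afterwards
    while True:
        try:
            r = requests.get(CHECK_URL, timeout=35)
            print(r.status_code)
        except requests.exceptions.Timeout:
            pass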

1. Web WeChat

Key points:

  1. Anti-hotlinking (see the sketch after this list)
    • headers
    • cookies
  2. Request validation
    • url
  3. Stored in the session:
    • qcode
    • ctime
    • login_cookie_dict
    • ticket_dict_cookie
    • init_cookie_dict
  4. Sending and receiving messages
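
For the anti-hotlinking point: a minimal sketch of fetching a protected image by replaying the Referer header and the login cookies (the URL and cookie values are placeholders):

import requests
img_url = 'https://wx.qq.com/cgi-bin/mmwebwx-bin/webwxgeticon'    # placeholder resource URL
cookies = {}    # in the project below this is the merge of LOGIN_COOKIE, TICKED_COOKIE and init_cookie_dict
res = requests.get(
    img_url,
    headers={'Referer': 'https://wx.qq.com'},   # pretend the request was made from wx.qq.com
    cookies=cookies,
)
with open('avatar.jpg', 'wb') as f:
    f.write(res.content)                        # .content is bytes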

Example

  1. views.py
    Sessions are used, so run python manage.py migrate first to create the session table
from django.shortcuts import render,HttpResponse,redirect
import requests
import re
import time
import json
#
def ticket(html):
    from bs4 import BeautifulSoup
    ret = {}
    soup = BeautifulSoup(html,'html.parser')
    for tag in soup.find(name='error').find_all():
        ret[tag.name] = tag.text
    return ret
#
def login(req):
    ctime = int(time.time() * 1000)
    # https://login.wx.qq.com/jslogin?appid=wx782c26e4c19acffb&redirect_uri=https://wx.qq.com/cgi-bin/mmwebwx-bin/webwxnewloginpage&fun=new&lang=zh_CN&_=1509346926556
    qcode_url = "https://login.wx.qq.com/jslogin?appid=wx782c26e4c19acffb&redirect_uri=https://wx.qq.com/cgi-bin/mmwebwx-bin/webwxnewloginpage&fun=new&lang=zh_CN&_={0}"
    qcode_url = qcode_url.format(ctime)
    r1 = requests.get(qcode_url)
    data = re.findall('uuid = "(.*)";',r1.text)
    uuid = data[0] if data else ""
    req.session['UUID_TIME'] = ctime
    req.session['UUID'] = uuid
    return render(req,'login.html',locals())
#
def check_login(req):
    # sessions are used, so python manage.py migrate must have been run to create the table
    tip = req.GET.get('tip')
    response = {'code':'408','data':None}
    ctime = int(time.time() * 1000)
    base_login_url = "https://login.wx.qq.com/cgi-bin/mmwebwx-bin/login?loginicon=true&uuid={0}&tip={1}&r=-950458844&_={2}"
    login_url = base_login_url.format(req.session['UUID'],tip,ctime)
    r1 = requests.get(login_url)       # this request is held open (long-polled) for up to 30s
    if 'window.code=408' in r1.text:
        # nobody has scanned the QR code yet
        response['code'] = 408
    elif 'window.code=201' in r1.text:
        # QR code scanned on the phone; WeChat returns the user's avatar
        response['code'] = 201
        avatar = re.findall("window.userAvatar = '(.*)';",r1.text)
        response['data'] = avatar[0] if avatar else ""
    elif 'window.code=200' in r1.text:
        # QR code scanned and login confirmed on the phone
        # window.redirect_url = https://wx.qq.com/cgi-bin/mmwebwx-bin/webwxnewloginpage?ticket=AVRlYkM4k3YRjZWjrXjWFTaD@qrticket_0&uuid=Acj-qXAfKA==&lang=zh_CN&scan=1508486099&fun=new&version=v2&lang=zh_CN
        # keep the cookies WeChat returned during login in our local session
        req.session['LOGIN_COOKIE'] = r1.cookies.get_dict()
        base_redirect_url = re.findall('window.redirect_uri="(.*)";', r1.text)[0]
        fun_version = '&fun=new&version=v2&lang=zh_CN'
        # the redirect address returned after scanning must be visited with &fun=new&version=v2&lang=zh_CN appended
        redirect_url = base_redirect_url + fun_version
        # requesting redirect_url returns XML credentials in the response body:
        # <error>
        #     <ret>0</ret>
        #     <message></message>
        #     <skey>@crypt_f3698ba3_a4587202cfd2373c1852e410e9edb468</skey>
        #     <wxsid>C301v3Pi0IT1ApDe</wxsid>
        #     <wxuin>1688082541</wxuin>
        #     <pass_ticket>VoJYSIvwQ9LtZjf9Rcmt50hrKkWSldts2jK5Nj%2B6464uHAA2YPd1QTJKNhuk%2BoHV</pass_ticket>
        #     <isgrayscale>1</isgrayscale>
        # </error>
        r2 = requests.get(redirect_url)
        ticket_dict = ticket(r2.text)
        req.session['TICKED_DICT'] = ticket_dict
        req.session['TICKED_COOKIE'] = r2.cookies.get_dict()
        response['code'] = 200
    # summary of the codes in r1.text:
    # window.code=408: the user did not scan within ~30 seconds
    # window.code=201: the user has scanned; return the avatar
    # window.code=200: the user scanned and confirmed the login
    return HttpResponse(json.dumps(response))
#
def index(req):
    """
    Show recent contacts
    :param req:
    :return:
    """
    if not req.session.get('TICKED_DICT'):
        return redirect('/login.html/')
    # requests.post(data='xxx')   # sent as Form Data
    # requests.post(json='xxx')   # sent as a Request Payload
    # initialize web WeChat: fetch recent contacts and subscribed accounts, and store the user's own info in the session
    # https://wx.qq.com/cgi-bin/mmwebwx-bin/webwxinit?r=-952560655&lang=zh_CN&pass_ticket=VoJYSIvwQ9LtZjf9Rcmt50hrKkWSldts2jK5Nj%252B6464uHAA2YPd1QTJKNhuk%252BoHV
    ticket_dict = req.session['TICKED_DICT']
    # init_url = "https://wx.qq.com/cgi-bin/mmwebwx-bin/webwxinit?r=-952560655&lang=zh_CN&pass_ticket={0}"
    init_url = "https://wx.qq.com/cgi-bin/mmwebwx-bin/webwxinit?r=-1930201634&lang=zh_CN&pass_ticket={0}"
    init_url = init_url.format(ticket_dict['pass_ticket'])
    post_data = {
        "BaseRequest":{
            'DeviceID':'e385574362482605',
            'Sid':ticket_dict['wxsid'],         # the wxsid field name may differ between WeChat versions
            'Skey':ticket_dict['skey'],
            'Uin':ticket_dict['wxuin'],
        }
    }
    # requests.post(init_url,json=post_data) would work as well
    response = requests.post(init_url,data=json.dumps(post_data),headers={'Content-Type':'application/json; charset=utf-8'})
    response.encoding = 'utf-8'
    init_dict = json.loads(response.text)
    req.session['init_cookie_dict'] = response.cookies.get_dict()
    sync_key = init_dict.pop('SyncKey')
    req.session['sync_key'] = sync_key
    req.session['init_dict'] = init_dict
    return render(req,'index.html',locals())
#
def avatar(req):
    """
    获取用户头像,不同网站针对用户获取用户头像又不同措施,
    本次示例,请求头中应包含Referer信息(https://wx.qq.com),cookie
    :param req:
    :return:
    """
    base_url = 'https://wx.qq.com'
    prev = req.GET.get('prev')          # the & characters in the original URL would truncate it,
    username = req.GET.get('username')  # so the pieces are passed as separate query parameters
    skey = req.GET.get('skey')          # and reassembled below
    img_url = '{0}{1}&username={2}&skey={3}'.format(base_url,prev, username, skey)
    cookies = {}
    cookies.update(req.session['LOGIN_COOKIE'])
    cookies.update(req.session['TICKED_COOKIE'])
    cookies.update(req.session['init_cookie_dict'])
    res = requests.get(img_url, cookies=cookies, headers={'Referer': 'https://wx.qq.com'})
    with open('a.jpg', 'ab') as f:
        f.write(res.content)          # .text is str, .content is bytes
    return HttpResponse(res.content)  # return raw bytes; the socket sends bytes anyway
#
def contact_list(request):
    """
    获取所有联系人
    :param req:
    :return:
    """
    ctime = int(time.time() * 1000)
    base_url = "https://wx.qq.com/cgi-bin/mmwebwx-bin/webwxgetcontact?lang=zh_CN&r={0}&seq=0&skey={1}"
    url = base_url.format(ctime,request.session['TICKED_DICT']['skey'])
    cookies={}
    cookies.update(request.session['LOGIN_COOKIE'])
    cookies.update(request.session['TICKED_COOKIE'])
    cookies.update(request.session['init_cookie_dict'])
    r1 = requests.get(url,cookies=cookies)
    r1.encoding = 'utf-8'
    user_list = json.loads(r1.text)
    # print(r1.text)
    return render(request,'contact_list.html',{'user_list':user_list})
#
def send_msg(req):
    ticket_dict = req.session['TICKED_DICT']
    ctime = time.time() * 1000
    to_user = req.GET.get('to_user')
    msg = req.GET.get('msg')
    pass_ticket = ticket_dict['pass_ticket']
    send_msg_url = 'https://wx.qq.com/cgi-bin/mmwebwx-bin/webwxsendmsg?pass_ticket={0}'.format(pass_ticket)
    # https://wx.qq.com/cgi-bin/mmwebwx-bin/webwxsendmsg?pass_ticket=QgKUtkx2SjxYNGEVcvgNpF28X2bI3AIaFgIDyakcYWlgtbFrfWo%2BUPCJdDB8ttua
    # sent as a POST
    post_data = {
        "BaseRequest":{
            'DeviceID':'e385574362482605',
            'Sid':ticket_dict['wxsid'],
            'Skey':ticket_dict['skey'],
            'Uin':ticket_dict['wxuin'],
        },
        "Msg":{
            'ClientMsgId':ctime,
            'LocalID':ctime,
            'Content':msg,
            'FromUserName':req.session['init_dict']['User']['UserName'],
            'ToUserName':to_user,
            'Type':1
        },
        "Scene":0
    }
    cookies = {}
    cookies.update(req.session['LOGIN_COOKIE'])
    cookies.update(req.session['TICKED_COOKIE'])
    cookies.update(req.session['init_cookie_dict'])
    # requests.post(url=send_msg_url,json=post_data) would escape non-ASCII characters,
    # so the body is serialized manually with ensure_ascii=False and encoded as UTF-8
    response = requests.post(url=send_msg_url,data=json.dumps(post_data,ensure_ascii=False).encode('utf-8'),headers={'Content-Type':'application/json'},cookies=cookies)
    return HttpResponse('OK!')
#
def getMsg(req):
    time.sleep(4.38)    # brief pause before the next sync check
    ctime = time.time() * 1000
    ticket_dict = req.session['TICKED_DICT']
    sync_key = req.session['sync_key']
    sync_key_list = []
    for item in sync_key['List']:
        tpl = "%s_%s" %(item['Key'],item['Val'])
        sync_key_list.append(tpl)
    resync_key = '|'.join(sync_key_list)
    get_msg_url = 'https://wx.qq.com/cgi-bin/mmwebwx-bin/webwxsync?sid={0}&skey={1}&lang=zh_CN'.format(ticket_dict['wxsid'],ticket_dict['skey'])
    params_dict = {
        'r':ctime,
        'skey':ticket_dict['skey'],
        'sid':ticket_dict['wxsid'],
        'uin':ticket_dict['wxuin'],
        'deviceid':'e385574362482605',
        'synckey':resync_key,
        '_':ctime,
    }
    cookies={}
    cookies.update(req.session['LOGIN_COOKIE'])
    cookies.update(req.session['TICKED_COOKIE'])
    cookies.update(req.session['init_cookie_dict'])
    # check whether any contact has sent a new message
    # https://webpush.wx.qq.com/cgi-bin/mmwebwx-bin/synccheck?r=1509460935015&skey=@crypt_f3698ba3_e81540305795b5c7354289b5f8c0f5ab&sid=IW72drK8Yeq+6Ksu&uin=1688082541&deviceid=e294959645380764&synckey=1_656159980|2_656160186|3_656160143|1000_1509445082&_=1509459994331
    response = requests.get('https://webpush.wx.qq.com/cgi-bin/mmwebwx-bin/synccheck',params=params_dict,cookies=cookies)
    # fetch the new messages
    # https://wx.qq.com/cgi-bin/mmwebwx-bin/webwxsync?sid=sUbFAe2FbEZBL/f2&skey=@crypt_f3698ba3_0702b1dd1c87ad751ca72d088ab4b7ff&lang=zh_CN
    post_data = {
        "BaseRequest":{
            'DeviceID':'e385574362482605',
            'Sid':ticket_dict['wxsid'],         # the wxsid field name may differ between WeChat versions
            'Skey':ticket_dict['skey'],
            'Uin':ticket_dict['wxuin'],
        },
        'SyncKey':sync_key,
        'rr':'-1928629916',
    }
    msg_response = requests.post(get_msg_url,json=post_data,cookies=cookies)
    msg_response.encoding = 'utf-8'
    msg_dict = json.loads(msg_response.text)
    print(response.text)
    if 'window.synccheck={retcode:"0",selector:"2"}' in response.text:
        # new messages arrived: print them
        for row in msg_dict['AddMsgList']:
            print(row.get('Content'))
    else:
        print('-'*200)
    req.session['sync_key'] = msg_dict['SyncKey']
    return HttpResponse('...')
#
def api(request):
    """
    Use this URL as an API endpoint, e.g. for server alerts.
    Extension: make it interactive, so that depending on the reply message
    the server takes a matching action, such as stopping a service
    :param request:
    :return:
    """
    name = request.GET.get('name')
    msg = request.GET.get('msg')
    # forwarding `msg` to `name` (send_msg-style logic) is left unimplemented in these notes
    return HttpResponse('OK')
# Overall flow: check the login status code and show the user's avatar once scanned,
# then fetch recent contacts, subscribed accounts and the full contact list,
# send WeChat messages to a contact, and receive messages from contacts
  2. login.html
<body>
<img id="img" style="width: 200px;height: 200px" src="https://login.weixin.qq.com/qrcode/{{ uuid }}">
<script src="/static/jquery-3.2.1.js"></script>
<script>
    TIP = 1;    {# TIP is 1 for the first request after the page loads; subsequent long-poll requests send 0 #}
    $(function () {
        checkLogin();
    });
    function checkLogin() {
        //send the request to our backend: the browser's same-origin policy blocks the cross-domain call, so the backend relays it to the WeChat server
        $.ajax({
            url:'/check_login.html/',
            type:'GET',
            data:{'tip':TIP},
            dataType:'JSON',
            success:function (arg) {
                if (arg.code == 408){
                    //408: nobody has scanned yet, keep long-polling
                    TIP = 0;
                    checkLogin();
                }else if(arg.code == 201){
                    //201: the phone has scanned, show the returned avatar
                    $('#img').attr('src',arg.data);
                    TIP = 0;
                    checkLogin();
                }else if(arg.code == 200) {
                    location.href = '/index.html/'
                }
            }
        })
    }
</script>
  3. index.html
<body>
    <h1>Personal info</h1>
    <img src="/avatar.html?prev={{ init_dict.User.HeadImgUrl }}/">
    <h2>{{ init_dict.User.NickName }}</h2>
    <h1>Recent contacts</h1>
    <ul>
        {% for user in init_dict.ContactList %}
            {#  the avatars are hotlink-protected: the browser would send Referer 127.0.0.1:8000, but wx.qq.com expects its own Referer, #}
            {#  so the image is fetched through our own backend view instead #}
            <li><img style="width: 50px;height: 50px;" src="/avatar.html?prev={{ user.HeadImgUrl }}/">  {{ user.NickName }} / unique ID:【{{ user.UserName }}】</li>
        {% endfor %}
    </ul>
    <a href="/contact_list.html/">更多联系人</a>
    <h1>公众号信息</h1>
</body>
  4. contact_list.html
<body>
    <div>
        <h3>Send a message</h3>
        <p>Recipient: <input type="text" id="to_user"></p>
        <p>Message: <input type="text" id="msg"></p>
        <input type="button" value="Send" id="btn">
    </div>
    <div>User list</div>
    <h3>Personal info</h3>
    <ul>
        <li>{{ request.session.init_dict.User.NickName }}</li>
        <li>{{ request.session.init_dict.User.UserName }}</li>
    </ul>
    <div>{{ user_list.MemberCount }}</div>
    {% for user in user_list.MemberList %}
        <div username="{{ user.UserName }}">
{#            <img style='width: 50px;height: 50px' src="/avatar.html?prev={{ user.HeadImgUrl }}/">#}
            <span>{{ user.NickName }}---->{{ user.UserName }}</span>
        </div>
    {% endfor %}
    <script src="/static/jquery-3.1.1.js"></script>
    <script>
        $(function () {
            bindBtnEvent();
            getMsg();
        });
        function bindBtnEvent() {
            $('#btn').click(function () {
                $.ajax({
                    url:'/send_msg.html',
                    type:'GET',
                    data:{'to_user':$('#to_user').val(),'msg':$('#msg').val()},
                    success:function (arg) {
                        console.log(arg);
                    }
                })
            });
        }
        function getMsg() {
            $.ajax({
                url:'/getMsg.html/',
                type:'GET',
                success:function (arg) {
                    getMsg();
                },
                error:function () {
                    getMsg();
                }
            })
        }
    </script>
</body>
  5. urls.py
from django.conf.urls import url
from django.contrib import admin
from app01 import views
urlpatterns = [
    url(r'^admin/', admin.site.urls),
    url(r'^login.html/$',views.login),
    url(r'^check_login.html/$',views.check_login),
    url(r'index.html/$',views.index),
    url(r'avatar.html/$',views.avatar),
    url(r'contact_list.html/$',views.contact_list),
    url(r'^send_msg.html/$',views.send_msg),
    url(r'^getMsg.html/$',views.getMsg),
    url(r'^api.html/$',views.api),
]

2. High-Performance Async IO

Traditional concurrency

Use a thread pool or a process pool; this works, but wastes resources (one worker per request).
Python 2 has no thread pool in the standard library (only a process pool).
Python 3 ships both thread pools and process pools.

from concurrent.futures import ThreadPoolExecutor,ProcessPoolExecutor
import requests
# python2: no thread pool in the stdlib, but a process pool
# python3: both a thread pool and a process pool
def task(url):
    response = requests.get(url)
    print(response.content)
pool = ThreadPoolExecutor(10)       # thread pool
# pool = ProcessPoolExecutor(4)    # process pool
url_list = ['http://www.baidu.com','http://www.bing.com']
for url in url_list:
    v = pool.submit(task,url)
pool.shutdown(wait=True)

Asynchronous, non-blocking

Non-blocking: don't wait
Asynchronous: callbacks
A single thread handles all the concurrent requests

  1. The asyncio module
import asyncio
import requests
@asyncio.coroutine
def fetch_async(func, *args):
    loop = asyncio.get_event_loop()
    future = loop.run_in_executor(None, func, *args)
    response = yield from future
    print(response.url, response.content)
tasks = [
    fetch_async(requests.get, 'http://www.cnblogs.com/wupeiqi/'),
    fetch_async(requests.get, 'http://dig.chouti.com/pic/show?nid=4073644713430508&lid=10273091')
]
loop = asyncio.get_event_loop()
results = loop.run_until_complete(asyncio.gather(*tasks))
loop.close()
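
For reference, a minimal sketch of the same pattern in the async/await syntax available since Python 3.5 (the decorator/yield-from form above is the legacy style; asyncio.run needs Python 3.7+):

import asyncio
import requests
async def fetch_async(func, *args):
    loop = asyncio.get_running_loop()
    # the blocking requests call runs in the default thread-pool executor
    response = await loop.run_in_executor(None, func, *args)
    print(response.url, len(response.content))
async def main():
    await asyncio.gather(
        fetch_async(requests.get, 'http://www.cnblogs.com/wupeiqi/'),
        fetch_async(requests.get, 'http://www.baidu.com'),
    )
asyncio.run(main())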
  2. The gevent module
import gevent
import requests
from gevent import monkey
monkey.patch_all()
def fetch_async(method, url, req_kwargs):
    print(method, url, req_kwargs)
    response = requests.request(method=method, url=url, **req_kwargs)
    print(response.url, response.content)
# ##### send the requests #####
gevent.joinall([
    gevent.spawn(fetch_async, method='get', url='https://www.python.org/', req_kwargs={}),
    gevent.spawn(fetch_async, method='get', url='https://www.yahoo.com/', req_kwargs={}),
    gevent.spawn(fetch_async, method='get', url='https://github.com/', req_kwargs={}),
])
# ##### send the requests (a coroutine pool caps the number of concurrent greenlets) #####
# from gevent.pool import Pool
# pool = Pool(None)
# gevent.joinall([
#     pool.spawn(fetch_async, method='get', url='https://www.python.org/', req_kwargs={}),
#     pool.spawn(fetch_async, method='get', url='https://www.yahoo.com/', req_kwargs={}),
#     pool.spawn(fetch_async, method='get', url='https://www.github.com/', req_kwargs={}),
# ])
  3. The Twisted module
from twisted.web.client import getPage, defer
from twisted.internet import reactor
def all_done(arg):
    reactor.stop()
def callback(contents):
    print(contents)
deferred_list = []
url_list = ['http://www.bing.com', 'http://www.baidu.com', ]
for url in url_list:
    deferred = getPage(bytes(url, encoding='utf8'))
    deferred.addCallback(callback)
    deferred_list.append(deferred)
dlist = defer.DeferredList(deferred_list)
dlist.addBoth(all_done)
reactor.run()
  4. Collect the links on a page, then fetch each linked page's HTML with async IO
from bs4 import BeautifulSoup
import requests
import asyncio
import time
import json
html_list = []
@asyncio.coroutine
def fetch_async(func, *args):
    loop = asyncio.get_event_loop()
    future = loop.run_in_executor(None, func, *args)
    response = yield from future
    content = response.content
    content = content.decode('utf-8','ignore')
    html_list.append({response.url:content,})
url = 'http://www.hao123.com'
html = requests.get(url)
html = html.content
html = html.decode('utf-8')
soup = BeautifulSoup(html,'html.parser')
a = soup.find_all(name='a')
tasks = []
print(time.ctime())
for href in a:
    href = str(href.get('href'))
    if href[:4] == 'http':
        task = fetch_async(requests.get,href)
        tasks.append(task)
loop = asyncio.get_event_loop()
results = loop.run_until_complete(asyncio.gather(*tasks))
loop.close()
f = open('abc.txt','a')
for i in html_list:
    f.write(json.dumps(i)+"\n")
f.close()
print(time.ctime())

A hand-rolled high-performance async IO module

IO multiplexing:
select detects changes on socket objects (connection established, data arrived); when something changes, the matching handler in our code runs.
Under the hood, a single select() system call watches a set of file descriptors; when select() returns, the kernel has flagged the descriptors that are ready, so the process can go on to read from or write to exactly those.
Socket flow (one client thread handling concurrent requests):
When the first task arrives, issue its connect request.
That hits an IO wait, but we do not wait;
we immediately issue the second task's connect request, and so on.
IO multiplexing then watches the sockets for changes:
once a connection succeeds,
send the request: GET / HTTP/1.0\r\nHost: ...
that again hits an IO wait, so don't wait; keep checking whether other connections have succeeded;
once a response comes back,
process it: read the returned content and run the callback.

# s1.py
import socket
import select
class Request(object):
    def __init__(self,sock,func,url):
        self.sock = sock
        self.func = func
        self.url = url
    def fileno(self):
        return self.sock.fileno()
def async_request(url_list):
    input_list = []
    conn_list = []
    for url in url_list:
        client = socket.socket()
        client.setblocking(False)                # make the socket non-blocking; connect() will then raise instead of waiting
        try:
            client.connect((url[0],80,))         # start the connection (would block by default)
        except BlockingIOError as e:
            pass
        obj = Request(client,url[1],url[0])
        input_list.append(obj)
        conn_list.append(obj)
    while True:
        # watch the sockets for changes:
        # a newly connected socket shows up in wlist = [client,]
        # a socket with response data shows up in rlist = [client,]
        rlist,wlist,elist = select.select(input_list,conn_list,[],0.05)
        # print(rlist)
        # print(wlist)
        for request_obj in wlist:  # connection established: send the request
            # print(request_obj)
            # print('connected')
            # print('sending request')
            request_obj.sock.sendall("GET / HTTP/1.1\r\nhost:{0}\r\n\r\n".format(request_obj.url).encode('utf-8'))
            conn_list.remove(request_obj)
        for request_obj in rlist:  # response data has arrived
            # print('received a response')
            data = request_obj.sock.recv(8096)
            request_obj.func(data)
            request_obj.sock.close()
            input_list.remove(request_obj)
        if not input_list:
            break
    # For comparison, the blocking version of a single request would be:
    # client.sendall(b"GET / HTTP/1.0\r\nhost:www.baidu.com\r\n\r\n")
    # data = client.recv(8096)    # read the response, assuming it fits in a single recv
# s2.py
import s1
def callback1(data):
    print(data)
def callback2(data):
    print(data)
url_list = [
    ['www.baidu.com',callback1],
    ['www.cnblogs.com',callback2],
]
s1.async_request(url_list)

Coroutines

Definition of a coroutine:
Pure control-flow switching: run one piece of code, jump to another piece, then jump back, and so on.
Async IO approaches:
[coroutine-based] coroutines + non-blocking sockets (gevent)

[event-loop-based] built entirely on sockets + select (IO multiplexing) (Twisted, tornado)

  1. How do you raise a crawler's concurrency?
    Use an async IO module such as asyncio, twisted or gevent.
    Under the hood they are either coroutine-based or event-loop-based.

  2. Asynchronous, non-blocking
    Asynchronous: callbacks (select)
    Non-blocking: don't wait (setblocking(False))

  3. What is a coroutine?

from greenlet import greenlet
def test1():
    print(12)
    gr2.switch()        # jump to test2
    print(34)
    gr2.switch()
def test2():
    print(56)
    gr1.switch()        # jump back into test1
    print(78)
gr1 = greenlet(test1)
gr2 = greenlet(test2)
gr1.switch()            # start test1; output order: 12 56 34 78

3. The Scrapy Framework

A one-off crawler script can be built with requests + bs4 plus twisted, gevent or asyncio.

What the Scrapy framework provides (the settings sketch after this list shows the corresponding options)

Page download --->  built on Twisted
HTML parsing  --->  Scrapy's built-in selectors
Rate limiting --->  throttle requests so the crawl looks more human
Deduplication --->  URLs that were already crawled are not crawled again
Recursion     --->  follow links recursively, with a configurable maximum depth
Proxies       --->  supported
HTTPS         --->  supported
Middlewares   --->  supported
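
A rough sketch of the settings.py options behind these features (these are standard Scrapy setting names; the values are illustrative only):

# settings.py
DOWNLOAD_DELAY = 2                  # rate limiting: pause between requests
AUTOTHROTTLE_ENABLED = True         # adapt the delay to server response times
DEPTH_LIMIT = 3                     # recursion: how many link levels to follow
DUPEFILTER_CLASS = 'scrapy.dupefilters.RFPDupeFilter'   # the default URL dedup filter
ROBOTSTXT_OBEY = True               # respect robots.txt
# proxies / HTTPS are usually handled through downloader middlewares, e.g.:
# DOWNLOADER_MIDDLEWARES = {
#     'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': 750,
# }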

Scrapy framework structure

1. Scrapy components

  • Engine (Scrapy)
    Handles the data flow of the whole system and triggers events (the core of the framework)
  • Scheduler
    Accepts requests pushed by the engine, queues them, and hands them back when the engine asks again. Think of it as a priority queue of URLs (the addresses to crawl): it decides which URL to fetch next and also removes duplicate URLs
  • Downloader
    Downloads page content and returns it to the spiders (the downloader is built on Twisted's efficient asynchronous model)
  • Spiders
    The spiders do the actual work: they extract the needed information, i.e. the items, from specific pages. They can also extract links for Scrapy to crawl as the next pages
  • Item Pipeline
    Processes the items the spiders extract; its main jobs are persisting items, validating them, and dropping unwanted data. Once a page has been parsed by a spider, its items are sent to the pipeline and processed through several stages in order.
  • Downloader Middlewares
    Sit between the Scrapy engine and the downloader and process the requests and responses that pass between them.
  • Spider Middlewares
    Sit between the Scrapy engine and the spiders and process the spiders' response input and request output.
  • Scheduler Middlewares
    Sit between the Scrapy engine and the scheduler and process the requests and responses sent between them.
  2. How Scrapy runs
    The engine takes a URL from the scheduler for the next crawl
    The engine wraps the URL in a Request and passes it to the downloader
    The downloader fetches the resource and wraps it in a Response
    The spider parses the Response
    If it yields items, they go to the item pipeline for further processing
    If it yields URLs, those go back to the scheduler to be crawled later
Spiders:
Think of a spider as a class holding URLs and custom callback functions.
There is at least one spider, and usually several run one after another.
If the returned data needs further work (e.g. visiting a next-level page for more data), the cycle starts again from the top.
If the returned data is the wanted result, it is handed to the item pipeline for persistence.
The Scrapy engine puts URL requests into the Scheduler.
Scheduler:
The scheduler holds a task queue; each task is an object containing the request URL and the name of its callback.
Downloader:
The downloader takes tasks from the scheduler, sends the remote requests via Twisted, and passes the results
as arguments to the spiders' callback functions.
Item Pipeline (persistence):
Results produced by a successful callback are handed to the item pipeline for persistence;
the persistence behaviour itself is defined inside the item pipeline.

Installing Scrapy

Linux
      pip3 install scrapy
Windows
      a. pip3 install wheel
      b. download Twisted from http://www.lfd.uci.edu/~gohlke/pythonlibs/#twisted
      c. cd into the download directory and run pip3 install Twisted‑17.1.0‑cp35‑cp35m‑win_amd64.whl
      d. pip3 install scrapy
      e. download and install pywin32: https://sourceforge.net/projects/pywin32/files/

Basic commands

  1. scrapy startproject <project name>

    • Creates a project skeleton in the current directory (similar to Django)
  2. scrapy genspider [-t template] <name> <domain>

    • Creates a spider inside the project
      e.g.:
      scrapy genspider -t basic oldboy oldboy.com
      scrapy genspider -t xmlfeed autohome autohome.com.cn
      PS:
      list the available templates: scrapy genspider -l
      show a template's source: scrapy genspider -d <template name>
  3. scrapy list

    • Lists the spiders in the project
  4. scrapy crawl <spider name>

    • Runs a single spider

Scrapy project layout

project_name/
   scrapy.cfg
   project_name/
       __init__.py
       items.py			# item definitions; used together with the pipeline for persistence
	   middlewares.py	# middlewares
       pipelines.py		# persistence
       settings.py		# configuration
       spiders/			# the actual spiders
           __init__.py
           spider1.py
           spider2.py
           spider3.py
File descriptions:
    scrapy.cfg   the project's top-level configuration (the real crawler settings live in settings.py)
    items.py     the data model for scraped items, similar to a Django Model
    pipelines.py item processing, typically persisting the structured data
    settings.py  configuration such as recursion depth, concurrency, download delay, etc.
    spiders/     the spider directory: create files here and write the crawl rules

A spider file

# chouti.py
import scrapy
import io,os,sys
sys.stdout=io.TextIOWrapper(sys.stdout.buffer,encoding='gb18030')	# gb18030 covers most East Asian scripts, so printing Chinese to the console does not break
class ChoutiSpider(scrapy.Spider):
    name = 'chouti'
    allowed_domains = ['chouti.com']
    start_urls = ['http://chouti.com/']
#
    def parse(self, response):
        print(response.text)
#		print(response.body)
# cmd : scrapy crawl chouti --nolog

Selectors

#!/usr/bin/env python
# -*- coding:utf-8 -*-
from scrapy.selector import Selector, HtmlXPathSelector
from scrapy.http import HtmlResponse
html = """<!DOCTYPE html>
<html>
    <head lang="en">
        <meta charset="UTF-8">
        <title></title>
    </head>
    <body>
        <ul>
            <li class="item-"><a id='i1' href="link.html">first item</a></li>
            <li class="item-0"><a id='i2' href="llink.html">first item</a></li>
            <li class="item-1"><a href="llink2.html">second item<span>vv</span></a></li>
        </ul>
        <div><a href="llink2.html">second item</a></div>
    </body>
</html>
"""
response = HtmlResponse(url='http://example.com', body=html,encoding='utf-8')
# hxs = HtmlXPathSelector(response)
# print(hxs)
# hxs = Selector(response=response).xpath('//a')
# print(hxs)
# hxs = Selector(response=response).xpath('//a[2]')
# print(hxs)
# hxs = Selector(response=response).xpath('//a[@id]')
# print(hxs)
# hxs = Selector(response=response).xpath('//a[@id="i1"]')
# print(hxs)
# hxs = Selector(response=response).xpath('//a[@href="link.html"][@id="i1"]')
# print(hxs)
# hxs = Selector(response=response).xpath('//a[contains(@href, "link")]')
# print(hxs)
# hxs = Selector(response=response).xpath('//a[starts-with(@href, "link")]')
# print(hxs)
# hxs = Selector(response=response).xpath('//a[re:test(@id, "i\d+")]')
# print(hxs)
# hxs = Selector(response=response).xpath('//a[re:test(@id, "i\d+")]/text()').extract()
# print(hxs)
# hxs = Selector(response=response).xpath('//a[re:test(@id, "i\d+")]/@href').extract()
# print(hxs)
# hxs = Selector(response=response).xpath('/html/body/ul/li/a/@href').extract()
# print(hxs)
# hxs = Selector(response=response).xpath('//body/ul/li/a/@href').extract_first()
# print(hxs)
#
# ul_list = Selector(response=response).xpath('//body/ul/li')
# for item in ul_list:
#     v = item.xpath('./a/span')
#     # 或
#     # v = item.xpath('a/span')
#     # 或
#     # v = item.xpath('*/a/span')
#     print(v)
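
Scrapy selectors also accept CSS expressions; a rough sketch of a few equivalents of the XPath queries above, reusing the response object built in the listing (extract()/extract_first() behave the same):

from scrapy.selector import Selector
sel = Selector(response=response)                      # reuses the response built above
print(sel.css('a::attr(href)').extract())              # href of every <a>
print(sel.css('a#i1::text').extract_first())           # text of the <a> with id="i1"
print(sel.css('li.item-0 a::attr(href)').extract())    # <a> inside <li class="item-0">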

Pipelines (persistence)

# /sp1/sp1/spiders/chouti.py
import scrapy
import io,os,sys
from scrapy import Request
from scrapy.selector import HtmlXPathSelector   # fills the same role as the bs4 module: parsing the HTML
sys.stdout=io.TextIOWrapper(sys.stdout.buffer,encoding='gb18030')
from ..items import Sp1Item
class ChoutiSpider(scrapy.Spider):
    name = 'chouti'                             # the spider is run by this name; a wrong name makes scrapy raise an error
    allowed_domains = ['chouti.com']            # URLs outside these domains will not be crawled
    start_urls = ['http://dig.chouti.com/']     # any number of start URLs; their default callback is parse()
    # def start_requests(self):                 # override start_requests (instead of start_urls) to customize headers, cookies, etc.
    #     yield Request(url='http://dig.chouti.com/',headers={},callback=self.parse)
    def parse(self, response):
        # print(response.text)
        hxs = HtmlXPathSelector(response)   # build the selector object used for the XPath matching below
        item_list = hxs.select('//div[@id="content-list"]/div[@class="item"]')
        # print(item_list)
        for item in item_list:
            # item.select('./div[@class="new-content"]/div[@class="part2"]/text()')
            # title = item.select('./div[@class="news-content"]/div[@class="part2"]/@share-title').extract()
            url = item.select('./div[@class="news-content"]/div[@class="part2"]/@share-pic').extract_first()
            title = item.select('./div[@class="news-content"]/div[@class="part2"]/@share-title').extract_first()
            # print(v)
            obj = Sp1Item(title=title,url=url)
            yield obj			# yielding an Item object hands its values to the pipeline automatically
#
# /sp1/sp1/items.py
import scrapy
class Sp1Item(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    title = scrapy.Field()	# item field
    url = scrapy.Field()	# item field
#
# /sp1/sp1/pipelines.py
class Sp1Pipeline(object):
    def __init__(self,file_path):
        self.file_path = file_path
        self.file_obj = None
    @classmethod
    def from_crawler(cls,crawler):
        """
        Called once at start-up to create the pipeline object
        :param crawler:
        :return: a pipeline instance
        """
        val = crawler.settings.get('XXX')		# read a custom value from settings.py
        return cls(val)
    def process_item(self, item, spider):
        self.file_obj.write(item['url']+'\r\n')
        print('pipline-->',item)
        return item
    def open_spider(self,spider):
        """
        Runs exactly once, when the spider starts
        :param spider:
        :return:
        """
        self.file_obj = open(self.file_path,mode='a+')	# open the output file when the spider starts
    def close_spider(self,spider):
        """
        Runs exactly once, when the spider closes
        :param spider:
        :return:
        """
        self.file_obj.close()							# close the file when the spider finishes
#
# settings.py
# uncomment ITEM_PIPELINES to enable the pipeline; 300 is the priority, and multiple pipelines may be registered
ITEM_PIPELINES = {
   'sp1.pipelines.Sp1Pipeline': 300,
}
XXX = "data.json"	# custom setting, read back in from_crawler

Running the spider recursively

# /sp1/sp1/spiders/chouti.py
import scrapy
import io,os,sys
from scrapy import Request
from scrapy.selector import HtmlXPathSelector
sys.stdout=io.TextIOWrapper(sys.stdout.buffer,encoding='gb18030')
from ..items import Sp1Item
class ChoutiSpider(scrapy.Spider):
    name = 'chouti'
    allowed_domains = ['chouti.com']
    start_urls = ['http://dig.chouti.com/']
    def parse(self, response):
        # print(response.text)
        hxs = HtmlXPathSelector(response)
        item_list = hxs.select('//div[@id="content-list"]/div[@class="item"]')
        # print(item_list)
        for item in item_list:
            # item.select('./div[@class="new-content"]/div[@class="part2"]/text()')
            # title = item.select('./div[@class="news-content"]/div[@class="part2"]/@share-title').extract()
            url = item.select('./div[@class="news-content"]/div[@class="part2"]/@share-pic').extract_first()
            title = item.select('./div[@class="news-content"]/div[@class="part2"]/@share-title').extract_first()
            # print(v)
            obj = Sp1Item(title=title,url=url)
            yield obj
        # find all pagination links and keep crawling them recursively with parse()
        # hxs.select('//div[@id="dig_lcpage"]//a/@href').extract()
        page_url_list = hxs.select('//div[@id="dig_lcpage"]//a[re:test(@href,"/all/hot/recent/\d+")]/@href').extract()
        for url in page_url_list:
            url = 'http://dig.chouti.com' + url
            obj = Request(url=url,callback=self.parse)  # wrap the url and its callback in a Request; headers={} or cookies can be passed here too
            yield obj                                   # yielding a Request object puts it into the Scheduler automatically
#
# settings.py
DEPTH_LIMIT = 1     		# limit the recursion depth; even a small limit explodes the result count when every page yields many URLs
ROBOTSTXT_OBEY = True		# whether to obey robots.txt; set to False to ignore it
