python3使用requests爬取新浪热门微博

微博登录的实现代码来源:https://gist.github.com/mrluanma/3621775


相关环境

使用的是 Python 3.4。配置好环境后,可以直接使用 pip 或 easy_install 命令安装第三方库,比如本示例需要依赖的库:

pip install requests
pip install rsa

代码实现

以下代码主要是登录成功后,爬取热门微博的 TOP 100,再保存到 hotwb.html 文件里边

import re
import json
import urllib.parse
import base64
import binascii
import json
 
import rsa
import requests
import logging

from pprint import pprint 

# Path of the HTML file that gethotwb() appends the scraped pages to.
wbdom = r'd:\pyzone\hotwb.html'
# Value sent as the ``client`` query parameter to the Sina SSO endpoints.
weclient = 'ssologin.js(v1.4.5)'
# Log line layout: timestamp followed by the message.
FORMAT = '%(asctime)-15s %(message)s'
# User-Agent header sent with every request made through ``session``.
user_agent = (
    'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.11 (KHTML, like Gecko) '
    'Chrome/20.0.1132.57 Safari/536.11'
)

logging.basicConfig(level=logging.DEBUG, format=FORMAT)
logger = logging.getLogger('weibo')
# One shared Session so that cookies set during login are reused by the
# later feed requests.  ``requests.Session()`` is the documented constructor;
# the lowercase ``requests.session()`` alias is kept only for compatibility.
session = requests.Session()
session.headers['User-Agent'] = user_agent
 
 
def encrypt_passwd(passwd, pubkey, servertime, nonce):
    """Return the RSA-encrypted password as hex-encoded bytes.

    The plaintext is ``servertime``, a tab, ``nonce``, a newline and then
    ``passwd`` (each converted with ``str``).  It is encrypted with the
    public key whose modulus is ``pubkey`` (a hexadecimal string) and whose
    exponent is 0x10001.
    """
    public_key = rsa.PublicKey(int(pubkey, 16), 0x10001)
    plaintext = ''.join(
        [str(servertime), '\t', str(nonce), '\n', str(passwd)]
    )
    ciphertext = rsa.encrypt(plaintext.encode(), public_key)
    return binascii.b2a_hex(ciphertext)
 
 
def wblogin(username, password):
    """Log the shared ``session`` in to Sina Weibo via the SSO endpoints.

    The flow is: fetch the pre-login parameters (server time, nonce, RSA
    public key), POST the credentials with the RSA-encrypted password,
    then follow the redirect URL embedded in the response.

    Args:
        username: Account name, as ``str`` or UTF-8 encoded ``bytes``.
        password: Plain-text password; it is encrypted by
            ``encrypt_passwd`` before being sent.

    Returns:
        True when the final response contains the expected JSON payload
        (which is pretty-printed), False otherwise.
    """
    # Normalise to ``str`` so both Base64 encodings below work for either
    # input type (b64encode() needs bytes, quote() output needs encoding).
    if isinstance(username, bytes):
        username = username.decode('utf-8')

    # Decode the Base64 result to text: formatting bytes with %s would put
    # the literal "b'...'" into the URL.  quote() keeps '+' and '=' from
    # being misread inside the query string.
    prelogin_su = base64.b64encode(username.encode('utf-8')).decode('ascii')
    resp = session.get(
        'http://login.sina.com.cn/sso/prelogin.php?'
        'entry=sso&callback=sinaSSOController.preloginCallBack&'
        'su=%s&rsakt=mod&client=%s' %
        (urllib.parse.quote(prelogin_su), weclient)
    )

    # The body is a JSONP call; pull out the first {...} object.
    pre_login_match = re.match(r'[^{]+({.+?})', resp.content.decode('gbk'))
    if pre_login_match is None:
        logger.debug('unexpected prelogin response')
        logger.info('login fail..')
        return False
    pre_login = json.loads(pre_login_match.group(1))

    login_su = base64.b64encode(
        urllib.parse.quote(username).encode('ascii')
    ).decode('ascii')
    data = {
        'entry': 'weibo',
        'gateway': 1,
        'from': '',
        'savestate': 7,
        'userticket': 1,
        'ssosimplelogin': 1,
        'su': login_su,
        'service': 'miniblog',
        'servertime': pre_login['servertime'],
        'nonce': pre_login['nonce'],
        'vsnf': 1,
        'vsnval': '',
        'pwencode': 'rsa2',
        'sp': encrypt_passwd(password, pre_login['pubkey'],
                             pre_login['servertime'], pre_login['nonce']),
        'rsakv': pre_login['rsakv'],
        'encoding': 'gbk',
        'prelt': '115',
        'url': 'http://weibo.com/ajaxlogin.php?framelogin=1&callback=parent.si'
               'naSSOController.feedBackUrlCallBack',
        'returntype': 'META'
    }
    resp = session.post(
        'http://login.sina.com.cn/sso/login.php?client=%s' % weclient,
        data=data
    )

    # The response page redirects with location.replace("<url>"); without
    # that call there is nothing to follow, so treat it as a failed login.
    redirect = re.search(r'replace\([\"\']([^\'\"]+)[\"\']',
                         resp.content.decode('gbk'))
    if redirect is None:
        logger.debug('no redirect URL in login response')
        logger.info('login fail..')
        return False

    resp = session.get(redirect.group(1))
    # Success is detected by the payload shape only: a JSON object that
    # closes with "}}".  NOTE(review): presumably only the successful
    # response nests an inner object -- confirm against the live API.
    login_match = re.match(r'[^{]+({.+?}})', resp.content.decode('gbk'))
    if login_match is None:
        logger.info('login fail..')
        return False

    logger.info('login success..')
    pprint(json.loads(login_match.group(1)))
    return True

def gethotwb(url):
    """Fetch pages 1-10 of the feed at ``url`` and append them to ``wbdom``.

    For each page number, the number is appended to ``url``, the response
    is parsed as JSON, and its ``['data']['html']`` value is written after
    a page-separator paragraph.

    Args:
        url: Feed URL ending just before the page number.

    Raises:
        KeyError: If a response lacks the ``data`` / ``html`` keys.
        ValueError: If a response body is not valid JSON.
    """
    # ``with`` guarantees the file is closed even if a request or the JSON
    # lookup fails half-way through the loop.
    with open(wbdom, mode='a', encoding='utf-8') as f:
        for x in range(1, 11):       # page 1 to 10
            r = session.get(url + str(x))
            r.encoding = 'utf-8'
            f.write('\n<p>--------page:' + str(x) + '---------</p>\n\n')
            f.write(json.loads(r.text)['data']['html'])
 
if __name__ == '__main__':
    # Scrape the hot feed only after a successful login; the credentials
    # here are placeholders.
    if wblogin(b'xx@163.com', 'xx'):
        gethotwb('http://hot.weibo.com/ajax/feed?type=h&v=9999&page=')

总结

  1. 测试过程中连接了翻墙(FQ)用的 VPN,属于异地登录,需要验证码,此时返回 retcode=4049;登录成功时 retcode 为 0
  2. python各个版本之间不兼容好蛋痛

大家中秋快乐!

参考文档

requests文档 http://docs.python-requests.org/zh_CN/latest/

微博登录过程分析 http://www.cnblogs.com/pzxbc/archive/2012/02/03/2335027.html

posted @ 2014-10-01 22:31  liaoyu  阅读(2515)  评论(0)  编辑  收藏  举报