一、模拟登录的意义

  当需要爬取的数据只有登录之后才能访问(例如个人信息)时, 就需要使用模拟登录

二、使用打码平台处理验证码

  云打码

  打码兔 

三、注册:

    普通用户注册

      充值题分(1块钱)

    开发者用户注册

      添加软件, 下载调用示例

    填写相关信息

      用户名

      密码

      软件名称

      软件密钥

四、调用实例

import http.client, mimetypes, urllib, json, time, requests
from get_img_code import YDMHttp


def get_code(types, filename, *, username='lxh66685', password='nihao123!',
             appid=7971, appkey='b6fef487706d29041c20e6f9da220669',
             timeout=30):
    """Recognise a captcha image through the Yundama (云打码) service.

    The account settings used to be hard-coded locals, which made the
    placeholder check below unreachable.  They are now keyword-only
    parameters whose defaults are the original values, so existing
    ``get_code(types, filename)`` calls behave exactly as before.

    NOTE(review): the defaults are real-looking credentials kept only for
    backward compatibility; callers should pass their own values rather than
    rely on secrets committed to source.

    Args:
        types: Captcha type ID forwarded to the service (e.g. 1004 for a
            4-character alphanumeric captcha).  Pricing differs per type;
            see http://www.yundama.com/price.html for the full list.
        filename: Path of the captcha image file to recognise.
        username: User name of the regular (non-developer) account.
        password: Password of that account.
        appid: Software ID from the developer console ("my software").
        appkey: Software key from the developer console ("my software").
        timeout: Recognition timeout in seconds.

    Returns:
        The ``result`` value produced by ``YDMHttp.decode``, or ``None`` when
        ``username`` is still the literal placeholder ``'username'``.
    """
    # Guard: refuse to call the service with the sample placeholder account.
    if username == 'username':
        print('请设置好相关参数再测试')
        return None

    # Initialise the client.
    yundama = YDMHttp(username, password, appid, appkey)

    # Log in to Yundama.
    uid = yundama.login()
    print('uid: %s' % uid)

    # Query the remaining balance.
    balance = yundama.balance()
    print('balance: %s' % balance)

    # Start recognition: image path, captcha type ID, timeout (seconds).
    cid, result = yundama.decode(filename, types, timeout)
    print('cid: %s, result: %s' % (cid, result))

    return result

五、模拟登录古诗文网

# 模拟登录古诗文网www.gushiwen.com
import requests
from lxml import etree
from urllib import request

# Use a Session instead of bare requests calls so cookies are handled
# automatically across requests.  It is bound to its own name so the imported
# `requests` module is not shadowed.
session = requests.Session()

# Seconds to wait on each HTTP call; without a timeout a stalled server would
# block the script forever.
REQUEST_TIMEOUT = 10

url = "https://www.gushiwen.com/main/login.html"
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36"
}

# 1. Request the login page and download the captcha image to disk.
page_text = session.get(url=url, headers=headers, timeout=REQUEST_TIMEOUT).text

tree = etree.HTML(page_text)
img_url = "https://www.gushiwen.com/" + tree.xpath('//div[@class="lg_content"]/ul/li[3]/img/@src')[0]

# Fetch the image through the session (rather than urllib's urlretrieve) so
# the request carries the User-Agent header and the session cookies.
page_content = session.get(url=img_url, headers=headers, timeout=REQUEST_TIMEOUT).content

with open('./code.jpg', 'wb') as f:
    f.write(page_content)


# 2. Recognise the captcha image.
code = get_code(1004, './code.jpg')
print(code)

data = {
    "user": "lxh661314",
    "pass": "nihao123!",
    "imgvc": code,
}

# 3. Perform the simulated login.
response = session.post(url=url, headers=headers, data=data, timeout=REQUEST_TIMEOUT)

# Fetch the user page with the same session and save it for inspection.
login_page_text = session.get(
    url="https://www.gushiwen.com/u.html",
    headers=headers,
    timeout=REQUEST_TIMEOUT,
).text

with open('./login.html', 'w', encoding="utf-8") as f:
    f.write(login_page_text)

六、登录古诗文

# 登录古诗文网
import requests
from lxml import etree

# One Session for the whole flow so cookies set by the login page are sent
# with the captcha download and the login POST.  It gets its own name so the
# imported `requests` module is not shadowed.
session = requests.Session()

# Seconds to wait on each HTTP call; without a timeout a stalled server would
# block the script forever.
REQUEST_TIMEOUT = 10

home_url = "https://so.gushiwen.org/user/login.aspx"
login_url = "https://so.gushiwen.org/user/login.aspx"
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36"
}

page_text = session.get(url=home_url, headers=headers, timeout=REQUEST_TIMEOUT).text

# Some login form fields are dynamic, so they are parsed out of the login
# page source instead of being hard-coded.
tree = etree.HTML(page_text)
view_state = tree.xpath('//input[@id="__VIEWSTATE"]/@value')[0]
view_state_generator = tree.xpath('//input[@name="__VIEWSTATEGENERATOR"]/@value')[0]
img_code_url = "https://so.gushiwen.org" + tree.xpath('//img[@id="imgCode"]/@src')[0]

print(view_state)

# Download the captcha through the same session so the request carries the
# User-Agent header and the session cookies.
page_content = session.get(url=img_code_url, headers=headers, timeout=REQUEST_TIMEOUT).content

with open('./code.png', 'wb') as f:
    f.write(page_content)

code = get_code(1004, './code.png')

data = {
    "__VIEWSTATE": view_state,
    "__VIEWSTATEGENERATOR": view_state_generator,
    "from": "",
    "email": "lxh661314@163.com",
    "pwd": "nihao123!",
    "code": code,
    "denglu": "登录",
}

# Simulated login: submit the form and save the response page for inspection.
page_text = session.post(
    url=login_url,
    headers=headers,
    data=data,
    timeout=REQUEST_TIMEOUT,
).text

with open('./gushiwen.html', 'w', encoding='utf-8') as f:
    f.write(page_text)

# 1. 模拟登录的请求参数有些是动态的, 需要从登录页面源码中动态解析获取
# 2. 下载验证码图片最好使用 requests(同一个 Session 对象)来下载, 这样请求会带上 UA 信息和 cookie
# 3. 使用Session()自动携带cookie请求