移动端数据爬取和12306模拟登陆

移动端数据的抓取

- 抓包工具

  - fiddler

  - 青花瓷(Charles)

  - mitmproxy

- 使用fiddler,环境的搭建

  1. 对fiddler进行配置:Tools  ->  Options  ->  Connections  ->  勾选 Allow remote computers to connect  ->  设置端口

  2. 测试端口是否可用:在pc浏览器中访问:本地ip+指定端口

  3. pc端开启热点,使用手机连接热点(保证手机和fiddler所在的pc是在同一网段)

  4. 在手机浏览器中访问fiddler所在pc的ip+指定端口,下载并安装证书,然后信任该证书

  5. 在手机中设置代理(在手机连接wifi中设置代理ip和端口 -- 都是基于fiddler)

import requests

# Request headers: send the User-Agent string of the mobile client
# (presumably copied from the request captured in Fiddler).
headers = {
    'User-Agent': 'qu tou tiao/3.6.1 (iPhone; iOS 12.3.1; Scale/3.00)/qukan_ios'
}

# Placeholder value: replace it with the URL captured in Fiddler.
url = "fiddler中抓到的url"

# verify=False makes requests ignore certificate errors.
# timeout=10 makes the call fail instead of blocking forever when the
# server (or the proxy in between) never answers.
json_data = requests.get(url=url, headers=headers, verify=False, timeout=10).json()
print(json_data)

 爬取抖音短视频:

  滑动到某一个视频,进入作者所有视频的主页,抓取主页中所有视频的数据(V9)

  1. 在fiddler中需要捕获到类似api.amemv开头的数据包(提取url,并且将fiddler中所有请求头信息携带对该url发起请求)

  2. 响应回来的是一组json数据,在json数据中查找带有v9或者mp4结尾的视频信息(需要自己分析)

  3. 有了对应视频的url就可以获取数据并保存。

selenium模拟登陆QQ空间

from selenium import webdriver
from time import sleep

# Start Chrome through the chromedriver binary and open the Qzone landing page.
bro = webdriver.Chrome(executable_path='chromedriver.exe')

url = 'https://qzone.qq.com'
bro.get(url)

# The elements used below are looked up after switching into 'login_frame'.
bro.switch_to.frame('login_frame')
bro.find_element_by_id('switcher_plogin').click()

# Fill in the two form inputs (ids 'u' and 'p'), then press the login button.
for field_id, text in (('u', '1327703464'), ('p', 'xxxxxxx')):
    bro.find_element_by_id(field_id).send_keys(text)
bro.find_element_by_id('login_button').click()

sleep(2)

# Page source of the home page shown after a successful login.
page_text = bro.page_source

无头浏览器

- phantomJs

  无可视化界面的浏览器

 举例:谷歌的无头浏览器:

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from time import sleep

# Chrome command-line switches: run headless, with the GPU switched off.
chrome_options = Options()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--disable-gpu')

# Pass the options through the `options` keyword; `chrome_options=` is the
# deprecated spelling of the same argument.
bro = webdriver.Chrome(executable_path='chromedriver.exe', options=chrome_options)

try:
    bro.get('https://www.baidu.com')
    sleep(3)
    print(bro.page_source)
    bro.save_screenshot('1.png')
finally:
    # A headless browser has no window to close by hand, so always shut it
    # down here, even when one of the steps above raised.
    bro.quit()

规避监测

- 有的网站会对selenium发起的请求进行监测,因此需要进行规避

  - 网站后台可以根据window.navigator.webdriver返回值进行selenium的监测

    - undefined:不是selenium发起的请求

    - true:是selenium发起的请求

- 规避监测的方法:

  具体看示例(通用的)

from selenium import webdriver
from selenium.webdriver import ChromeOptions

# Build Chrome options with the experimental 'excludeSwitches' setting listing
# 'enable-automation' (presumably this stops Chrome from being started with
# that switch -- confirm against the ChromeDriver capabilities documentation).
option = ChromeOptions()
option.add_experimental_option('excludeSwitches', ['enable-automation'])
# This is what achieves the detection evasion.
bro = webdriver.Chrome(executable_path='chromedriver.exe', options=option)
bro.get('https://www.taobao.com/')

12306模拟登陆

12306登录需要识别图片验证码,所以再次用到超级鹰(chaojiying.py)

import requests
from hashlib import md5


class Chaojiying_Client(object):
    """Small HTTP client for the Chaojiying captcha-recognition service.

    Every request carries the account name, the MD5 hex digest of the
    password and the software id as form fields.
    """

    def __init__(self, username, password, soft_id, timeout=60):
        """Store the credentials and the headers shared by all requests.

        username: Chaojiying account name.
        password: plain-text password; only its MD5 hex digest is kept.
        soft_id: software id registered with the service.
        timeout: seconds to wait for each HTTP request before giving up
            (passed to requests as ``timeout``).
        """
        self.username = username
        # The service receives the digest (field 'pass2'), not the password.
        self.password = md5(password.encode('utf8')).hexdigest()
        self.soft_id = soft_id
        self.timeout = timeout
        self.base_params = {
            'user': self.username,
            'pass2': self.password,
            'softid': self.soft_id,
        }
        self.headers = {
            'Connection': 'Keep-Alive',
            'User-Agent': 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0)',
        }

    def PostPic(self, im, codetype):
        """Upload an image for recognition and return the decoded JSON reply.

        im: image bytes.
        codetype: captcha type code, see http://www.chaojiying.com/price.html
        """
        params = {
            'codetype': codetype,
        }
        params.update(self.base_params)
        files = {'userfile': ('ccc.jpg', im)}
        # The timeout keeps a stalled upload from blocking the caller forever.
        r = requests.post('http://upload.chaojiying.net/Upload/Processing.php', data=params, files=files,
                          headers=self.headers, timeout=self.timeout)
        return r.json()

    def ReportError(self, im_id):
        """Report a wrongly recognised image and return the decoded JSON reply.

        im_id: image id of the captcha being reported.
        """
        params = {
            'id': im_id,
        }
        params.update(self.base_params)
        r = requests.post('http://upload.chaojiying.net/Upload/ReportError.php', data=params,
                          headers=self.headers, timeout=self.timeout)
        return r.json()

也用到了pillow模块

from PIL import Image
from time import sleep
from selenium import webdriver
from selenium.webdriver import ActionChains
from chaojiying import Chaojiying_Client


def getCode(imgPath, imgType):
    """Send a saved image to Chaojiying and return the recognised answer.

    imgPath: path of the image file to upload.
    imgType: captcha type code, handed to PostPic as its codetype.
    Returns the 'pic_str' field of the JSON reply.
    """
    chaojiying = Chaojiying_Client('超级鹰账号', '密码', '软件id')
    # Read the locally saved screenshot; the with-block closes the file
    # handle as soon as the bytes are in memory.
    with open(imgPath, 'rb') as img_file:
        im = img_file.read()
    return chaojiying.PostPic(im, imgType)['pic_str']


# Open the 12306 login page and wait three seconds before taking the screenshot.
bro = webdriver.Chrome(executable_path='chromedriver.exe')
url = 'https://kyfw.12306.cn/otn/login/init'
bro.get(url)
sleep(3)

# Take a screenshot of the page
bro.save_screenshot('main.png')

img_ele = bro.find_element_by_xpath('//*[@id="loginForm"]/div/ul[2]/li[4]/div/div/div[3]/img')
# location holds the coordinates of the captcha image's top-left corner
location = img_ele.location
print('location', location)
# size holds the width and height of the image
size = img_ele.size
print('size', size)
# rangle is the crop box: (left, top, right, bottom)
# NOTE(review): the element coordinates are used directly as screenshot pixel
# coordinates -- presumably this assumes a device pixel ratio of 1; confirm.
rangle = (
    int(location['x']), int(location['y']), int(location['x'] + size['width']), int(location['y'] + size['height']))

# Crop the screenshot down to that box
i = Image.open('main.png')
# code_img_name is the file name the cropped captcha image is saved under
code_img_name = 'code.png'
frame = i.crop(rangle)
frame.save(code_img_name)

# Note: code.png is the captcha image, main.png is the screenshot of the login page
result = getCode('code.png', 9004)
print(result)

# Turn the recognition result into a list of [x, y] click points.
# The result holds 'x,y' pairs joined by '|'. A result without '|' splits
# into a single piece, so one loop covers both the one-point and the
# several-point case.
all_list = []
for point in result.split('|'):
    coords = point.split(',')
    all_list.append([int(coords[0]), int(coords[1])])

print(all_list)
# Click every point in all_list on the captcha image through selenium
for x, y in all_list:
    # perform() executes the queued actions immediately
    ActionChains(bro).move_to_element_with_offset(img_ele, x, y).click().perform()
    sleep(0.5)

# Type the account name and then the password, pausing two seconds after each
for field_id, text in (('username', '13521980615'), ('password', 'xxxxxxx')):
    bro.find_element_by_id(field_id).send_keys(text)
    sleep(2)

# Submit the form, wait ten seconds, then close the browser
bro.find_element_by_id('loginSub').click()
sleep(10)
bro.quit()

 

posted @ 2019-08-11 16:50  blog_wu  阅读(862)  评论(1)  编辑  收藏  举报