通过Appium+mitmproxy爬取快手短视频,并将爬取信息存入mongodb数据库

一、目标

通过appium模拟滑动快手发现页中的视频,通过mitmproxy代理抓取视频信息,存入mongodb数据库中

二、步骤

  • 通过fiddler抓包工具分析接口

    • 先通过fiddler抓包工具,分析并获取到视频数据的api,发现视频信息api中包含“rest/n/feed/hot”。然后将返回的json数据通过json.cn网站进行打开分析,解析想要的数据,并编写mitmproxy的脚本,存入mongodb数据库。
  • 编写mitmproxy脚本

    # coding:utf-8
    import pymongo
    import json
    
    # MongoDB connection. NOTE(review): '主机名' literally means "hostname" and
    # looks like a placeholder -- replace it with the real MongoDB host.
    client = pymongo.MongoClient(host='主机名',port=27017)
    db = client['kuaishou']
    collection = db['video_info']
    
    def _extract_video_info(info):
        """Build the MongoDB document for one feed entry.

        Returns None when the entry carries no video URL, because the URL is
        the key used for the upsert and an entry without it cannot be stored.
        Other missing fields are stored as None instead of aborting the batch.
        """
        mv_urls = info.get('main_mv_urls') or []
        video_url = mv_urls[0].get('url') if mv_urls else None
        if not video_url:
            return None
        return {
            'user_id': info.get('user_id'),  # user id
            'user_name': (info.get('user_name') or '').strip(),  # user name
            'title': info.get('caption'),  # caption is stored as the title
            'video_url': video_url,  # video address
            # raw value divided by 1000 (presumably ms -> s; confirm against the API)
            'duration': int((info.get('duration') or 0) / 1000),
            'view_count': info.get('view_count'),  # number of views
            'share_count': info.get('share_count'),  # number of shares
            'comment_count': info.get('comment_count'),  # number of comments
            'like_count': info.get('like_count'),  # number of likes
            'unlike_count': info.get('unlike_count'),  # number of dislikes
            'share_info': info.get('share_info'),  # share information
        }


    def response(flow):
        """mitmproxy response hook: store videos from the hot-feed API in MongoDB.

        Only responses whose request URL contains 'rest/n/feed/hot' are
        processed. Each entry of the JSON body's 'feeds' list is upserted into
        ``collection`` keyed by its video URL. Bodies that are not JSON objects,
        a missing or empty 'feeds' list, and entries without a video URL are
        skipped silently.
        """
        if 'rest/n/feed/hot' not in flow.request.url:
            return
        try:
            info_dict = json.loads(flow.response.text)
        except (TypeError, ValueError):
            # empty or non-JSON body: nothing to store
            return
        if not isinstance(info_dict, dict):
            return
        # 'feeds' may be absent or null; treat both as "no entries"
        for info in info_dict.get('feeds') or []:
            video_info = _extract_video_info(info)
            if video_info is None:
                continue
            # replace the existing document for this URL, or insert a new one
            collection.replace_one({'video_url': video_info['video_url']}, video_info, upsert=True)
  • 编写爬虫脚本

    # coding:utf-8
    import time

    from appium.webdriver import Remote
    from selenium.common.exceptions import TimeoutException
    from selenium.webdriver.support.ui import WebDriverWait as WAIT
    
    # Desired capabilities sent to the Appium server when the session is created.
    cap = {
        "platformName": "Android",
        "platformVersion": "5.1.1",
        # NOTE(review): looks like the adb address of a local emulator -- confirm
        "deviceName": "127.0.0.1:62001",
        "appPackage": "com.smile.gifmaker",
        "appActivity": "com.yxcorp.gifshow.HomeActivity",
        "noReset": True,
        "unicodeKeyboard": True,
        # The Appium capability is named "resetKeyboard"; the previous key
        # "keyboardReset" is not a recognised capability.
        "resetKeyboard": True,
    }
    
    def get_size(driver):
        """Return the driver's window size as a ``(width, height)`` tuple."""
        window = driver.get_window_size()
        width, height = window['width'], window['height']
        return width, height
    
    
    driver = Remote('http://127.0.0.1:4723/wd/hub',desired_capabilities=cap)  # connect to the Appium server

    try:
        # Dismiss the teen-mode popup by tapping its "我知道了" ("I got it") button.
        # The wait polls for up to 400 seconds; if the button never shows up the
        # wait raises TimeoutException and the script simply carries on. Any other
        # error (lost session, failed click) is no longer hidden.
        try:
            i_know = WAIT(driver, 400).until(lambda x:x.find_element_by_android_uiautomator('new UiSelector().className(\"android.widget.TextView\").textContains(\"我知道了\").resourceId(\"com.smile.gifmaker:id/positive\")'))
        except TimeoutException:
            pass
        else:
            i_know.click()

        time.sleep(2)
        size = get_size(driver)  # (width, height) of the Kuaishou window

        # Swipe vertically along the horizontal centre, from 80% of the height
        # up to 20% of the height.
        x = int(size[0]*0.5)
        y_start = int(size[1]*0.8)
        y_end = int(size[1]*0.2)

        # Perform 20 swipes, pausing one second between them.
        for _ in range(20):
            driver.swipe(x,y_start,x,y_end,200)  # swipe duration: 200 ms
            time.sleep(1)
    finally:
        # Always end the Appium session, even when a step above fails.
        driver.quit()
  • 写个脚本下载视频

    # coding:utf-8
    import requests
    import pymongo
    import os
    import time
    import re
    
    # Request headers for the video downloads. The HTTP header name is
    # 'User-Agent' (with a hyphen); the previous key 'UserAgent' was sent as an
    # unrelated header, so the browser string never replaced the default agent.
    headers = {
        'User-Agent':'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    # MongoDB connection. NOTE(review): '主机名' literally means "hostname" and
    # looks like a placeholder -- replace it with the real MongoDB host.
    client = pymongo.MongoClient(host='主机名',port=27017)
    db = client['kuaishou'] # database name
    collection = db['video_info'] # collection name
    
    # Directory the videos are saved into. exist_ok=True makes this a no-op when
    # the directory is already there, without a separate check-then-create step.
    os.makedirs('./videos', exist_ok=True)
    
    video_infos = collection.find({})  # cursor over every stored video document

    # The file name is taken from the clientCacheKey query parameter of the URL.
    name_pattern = re.compile(r'clientCacheKey=(.*?\.mp4)')

    for video_info in video_infos:
        video_url = video_info['video_url']
        matched = name_pattern.search(video_url)
        if matched is None:
            # No usable file name in this URL; skip it instead of crashing on None.
            print('skip, no clientCacheKey in url: ' + video_url)
            continue
        # Keep only the final path component so the name cannot leave ./videos.
        video_name = os.path.basename(matched.group(1))
        try:
            # stream=True avoids holding a whole video in memory; the timeout
            # keeps one stalled connection from hanging the entire run.
            with requests.get(video_url, headers=headers, stream=True, timeout=30) as resp:
                resp.raise_for_status()  # do not save an HTTP error page as a video
                with open(os.path.join('./videos', video_name), 'wb') as f:
                    for chunk in resp.iter_content(chunk_size=64 * 1024):
                        f.write(chunk)
        except requests.RequestException as err:
            print('download failed: ' + video_url + ' (' + str(err) + ')')
        time.sleep(1)  # pause between downloads

     

posted @ 2021-08-06 11:13  eliwang  阅读(468)  评论(0)  编辑  收藏  举报