通过Appium+mitmproxy爬取快手短视频,并将爬取信息存入mongodb数据库
一、目标
通过appium模拟滑动快手发现页中的视频,通过mitmproxy代理抓取视频信息,存入mongodb数据库中
二、步骤
- 通过fiddler抓包工具分析接口
- 先通过fiddler抓包工具分析并找到返回视频数据的api,发现该api的URL中包含“rest/n/feed/hot”。然后将返回的json数据粘贴到json.cn网站中查看其结构,确定需要提取的字段,再编写mitmproxy脚本解析这些字段并存入mongodb数据库。
- 编写mitmproxy脚本
# coding:utf-8
"""mitmproxy script: save Kuaishou hot-feed video metadata to MongoDB.

mitmproxy invokes ``response(flow)`` for each completed HTTP response.
Responses whose request URL contains ``rest/n/feed/hot`` are parsed as JSON
and every entry of the ``feeds`` list is upserted into the
``kuaishou.video_info`` collection, keyed by video URL.
"""
import json
import logging

import pymongo

logger = logging.getLogger(__name__)

# MongoDB connection. The host value is the article's placeholder and must be
# replaced with a real host name before running.
client = pymongo.MongoClient(host='主机名', port=27017)
db = client['kuaishou']
collection = db['video_info']


def _build_video_info(info):
    """Map one raw feed entry to the document stored in MongoDB.

    Raises KeyError / IndexError / TypeError / AttributeError when the entry
    lacks an expected field or a field has an unexpected type.
    """
    return {
        'user_id': info['user_id'],                      # user id
        'user_name': info['user_name'].strip(),          # user name
        'title': info['caption'],                        # title
        'video_url': info['main_mv_urls'][0]['url'],     # video address
        # presumably milliseconds -> whole seconds; verify against the API
        'duration': int(info['duration'] / 1000),        # video duration
        'view_count': info['view_count'],                # views
        'share_count': info['share_count'],              # shares
        'comment_count': info['comment_count'],          # comments
        'like_count': info['like_count'],                # likes
        'unlike_count': info['unlike_count'],            # dislikes
        'share_info': info['share_info'],                # share information
    }


def response(flow):
    """Store every video described by a hot-feed API response.

    Args:
        flow: the mitmproxy HTTP flow; ``flow.request.url`` and
            ``flow.response.text`` are read.
    """
    if 'rest/n/feed/hot' not in flow.request.url:
        return

    info_dict = json.loads(flow.response.text)
    # 'feeds' may be absent or null; treat both as "no videos" instead of
    # failing with "'NoneType' object is not iterable".
    for info in info_dict.get('feeds') or []:
        try:
            video_info = _build_video_info(info)
        except (KeyError, IndexError, TypeError, AttributeError) as exc:
            # One malformed entry must not discard the rest of the batch.
            logger.warning('skipping malformed feed entry: %r', exc)
            continue
        # Replace the document with the same video_url, or insert it if new.
        collection.replace_one(
            {'video_url': video_info['video_url']},
            video_info,
            upsert=True,
        )
- 编写爬虫脚本
# coding:utf-8
"""Appium script: open the Kuaishou app and swipe through the video feed.

Connects to a local Appium server, dismisses the teen-mode pop-up if it
shows up, then swipes upwards 20 times so the app keeps requesting new feed
pages.
"""
import time

from appium.webdriver import Remote
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.support.ui import WebDriverWait as WAIT

# desired_capabilities
cap = {
    "platformName": "Android",
    "platformVersion": "5.1.1",
    "deviceName": "127.0.0.1:62001",
    "appPackage": "com.smile.gifmaker",
    "appActivity": "com.yxcorp.gifshow.HomeActivity",
    "noReset": True,
    "unicodeKeyboard": True,
    "keyboardReset": True,
}


def get_size(driver):
    """Return the window size of the page as a ``(width, height)`` tuple."""
    size = driver.get_window_size()
    return size['width'], size['height']


# Connect the Appium client to the server.
driver = Remote('http://127.0.0.1:4723/wd/hub', desired_capabilities=cap)
try:
    # Click the "I know" button of the teen-mode pop-up, if it appears.
    try:
        i_know = WAIT(driver, 400).until(
            lambda x: x.find_element_by_android_uiautomator(
                'new UiSelector().className(\"android.widget.TextView\").textContains(\"我知道了\").resourceId(\"com.smile.gifmaker:id/positive\")'
            )
        )
        i_know.click()
    except WebDriverException:
        # Best effort: the pop-up is optional (this also covers the wait
        # timing out). Unlike a bare ``except``, Ctrl-C still interrupts
        # the long wait.
        pass

    time.sleep(2)

    size = get_size(driver)  # size of the Kuaishou screen

    # Swipe along the horizontal centre, from 80% of the height up to 20%.
    x = int(size[0] * 0.5)
    y_start = int(size[1] * 0.8)
    y_end = int(size[1] * 0.2)

    # Simulate 20 swipes.
    for _ in range(20):
        driver.swipe(x, y_start, x, y_end, 200)  # swipe duration: 200 ms
        time.sleep(1)
finally:
    # Always end the Appium session so it is not left open on the server.
    driver.quit()
- 写个脚本下载视频
# coding:utf-8
"""Download every video recorded in the ``kuaishou.video_info`` collection.

Each document's ``video_url`` is fetched and saved under ``./videos`` using
the ``clientCacheKey`` value from the URL as the file name.
"""
import os
import re
import time

import pymongo
import requests

# 'User-Agent' is the actual HTTP header name; the previous key 'UserAgent'
# was sent as an unrelated custom header and left the default agent in place.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}

# The host value is the article's placeholder; replace it with a real host.
client = pymongo.MongoClient(host='主机名', port=27017)
db = client['kuaishou']  # database name
collection = db['video_info']  # collection name

# Directory where the videos are stored.
VIDEO_DIR = './videos'
os.makedirs(VIDEO_DIR, exist_ok=True)

# Seconds to wait for the server before giving up on one download.
DOWNLOAD_TIMEOUT = 30

# The file name is taken from the clientCacheKey query parameter of the URL.
NAME_PATTERN = re.compile(r'clientCacheKey=(.*?\.mp4)')

video_infos = collection.find({})  # iterable cursor over all documents
for video_info in video_infos:
    video_url = video_info['video_url']

    match = NAME_PATTERN.search(video_url)
    if match is None:
        # No usable file name in this URL; skip it instead of crashing.
        print('skip (no clientCacheKey in url): ' + video_url)
        continue
    # basename() keeps a crafted key from writing outside VIDEO_DIR.
    video_name = os.path.basename(match.group(1))

    try:
        resp = requests.get(video_url, headers=headers, timeout=DOWNLOAD_TIMEOUT)
        resp.raise_for_status()  # do not save an HTTP error page as a video
    except requests.RequestException as exc:
        print('skip (download failed): ' + video_url + ' ' + repr(exc))
    else:
        with open(os.path.join(VIDEO_DIR, video_name), 'wb') as f:
            f.write(resp.content)

    time.sleep(1)  # pause between requests