[Crawler] Get the real file link of BaiduYun shared by user with Chrome
Get the real download link of a file that a user has shared on BaiduYun.
with Python 2.7 + Selenium + Chrome driver
After several unsatisfactory attempts we finally arrived at a viable approach. One of those attempts was:
-
There is still a lot of work left to improve the performance, for example by implementing threading.
-
History is an SQLite database file that can easily be parsed with the sqlite3 module. You can browse the data with DB Browser for SQLite.
# -*- coding: utf-8 -*-
#----------------------------
# Author: Kun Liu
# Start date: 2017-03-10
# Latest edit: 2017-03-13
# Email: lancelotdev@163.com
#=============================
# Read baiduyun file links from chrome history file
"""
### 解决方案:
1. 制定user data目录,通过 selenium 模拟 chrome 浏览器创建下载任务,但并不完成下载。
2. 解析 userdata 中的 History 获取真实资源链接。
### Note:
1. 未做资源链接去重处理。
2. 存在多次访问后出现的验证问题,待研究。
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
import time
import os
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.alert import Alert
from FileItem import FileItem
# Chrome profile ("user data") directory handed to the browser below; the
# script entry point later reads the "Default/History" file underneath it.
user_data_dir_path = "d://userData"
# Chrome options shared by every browser session this script starts.
options = webdriver.ChromeOptions()
# Make Chrome use the profile directory configured above.
options.add_argument("user-data-dir=%s"%user_data_dir_path)
# Visit every share URL so that the resulting downloads are recorded in Chrome's history.
def baiduyun_url_travel(share_url_list=None):
    """Open each BaiduYun share page in Chrome and click its target link.

    For every URL the page is loaded, ``navigator.platform`` is overridden
    through injected JavaScript, and the anchor located by a fixed XPath
    (presumably the download button -- confirm against the live page) is
    clicked. The browser is always closed afterwards, even when a page fails.

    Args:
        share_url_list: Iterable of share page URLs. ``None`` or an empty
            list makes the function return immediately without starting
            a browser.

    Returns:
        None.

    Raises:
        Whatever Selenium raises when a page cannot be driven, e.g. when the
        target element cannot be found at all. The browser is shut down
        before the exception propagates.
    """
    # Return before launching Chrome: starting the driver first would leave
    # a browser process running for an empty request.
    if not share_url_list:
        return

    target_xpath = '//*[@id="layoutMain"]/div[1]/div[1]/div/div[2]/div/div/div[2]/a[2]'
    # Makes the page see 'sb_baidu' as navigator.platform.
    js_str = "Object.defineProperty(Object.getPrototypeOf(navigator),'platform',{get:function(){return 'sb_baidu';}})"

    driver = webdriver.Chrome(chrome_options=options)
    try:
        # Init the user data such as cookie so you won't need to request a url twice.
        driver.get(share_url_list[0])
        for url in share_url_list:
            driver.get(url)
            time.sleep(3)
            driver.execute_script(js_str)
            try:
                element = WebDriverWait(driver, 10).until(
                    EC.presence_of_element_located((By.XPATH, target_xpath))
                )
            except Exception:
                # Best effort: the explicit wait failed, so try one direct
                # lookup. If this fails too, the error propagates instead of
                # clicking an unbound or stale element.
                element = driver.find_element(By.XPATH, target_xpath)
            element.click()
            # Give Chrome time to register the download before moving on.
            time.sleep(5)
    finally:
        # Always release the browser, including on errors.
        driver.quit()
# 2017-03-13 Liu Kun
# The 'History' file is a sqlite database.
# Some download links redirect to other URLs; Chrome records each hop of the redirect chain,
# and here I use the first link of the chain (chain_index=0), i.e. the link before any redirect.
def _history_time_to_local_string(time_stamp):
    """Format a ``downloads.start_time`` value as local 'YYYY-MM-DD HH:MM:SS'.

    Chrome's internal time format counts microseconds since 1601-01-01, which
    gives 17-digit values for present-day dates. Values of at least 10**16
    are converted from that format. Smaller values keep the previous
    behaviour: their first ten digits are read as Unix seconds.
    """
    # Seconds between 1601-01-01 and 1970-01-01 (the Unix epoch).
    epoch_offset_seconds = 11644473600
    if int(time_stamp) >= 10 ** 16:
        unix_seconds = int(time_stamp) // 1000000 - epoch_offset_seconds
    else:
        unix_seconds = int(str(time_stamp)[0:10])
    return time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(unix_seconds))


def get_source_link_from_history(History_path):
    """Collect the recorded downloads from a Chrome 'History' database.

    Every ``downloads_url_chains`` row with ``chain_index=0`` is matched
    against the ``downloads`` table by id; chain rows without a matching
    download are skipped.

    Args:
        History_path: Path of the SQLite 'History' file to read.

    Returns:
        A list with one ``FileItem(...).make_dic()`` result per matched
        download, built from the file name, the link and the start time.

    Raises:
        sqlite3.Error: If the file cannot be queried (for example when the
            expected tables are missing).
    """
    import sqlite3 as db

    chain_sql = "select id, chain_index, url from downloads_url_chains where chain_index=0"
    # Parameterized lookup instead of string formatting.
    download_sql = "select current_path, start_time from downloads where id=?"

    items = []
    conn = db.connect(History_path)
    try:
        cursor = conn.cursor()
        rows = cursor.execute(chain_sql).fetchall()
        for download_id, _, file_link in rows:
            file_info = cursor.execute(download_sql, (int(download_id),)).fetchone()
            if not file_info:
                continue
            current_path, time_stamp = file_info
            # C:\Users\kun_liu\Downloads\shadowsocks-nightly-3.2.7.apk.crdownload
            file_name = current_path.split('\\')[-1].replace('.crdownload', '')
            start_time = _history_time_to_local_string(time_stamp)
            item = FileItem(file_name, file_link, start_time)
            items.append(item.make_dic())
    finally:
        # Close the connection so the History file is not kept open.
        conn.close()
    return items
if __name__ == "__main__":
    import pprint

    # Sample share pages.
    # Movie: https://pan.baidu.com/s/1sl8litZ   App: https://pan.baidu.com/s/1o8K255K
    share_url = [
        "https://pan.baidu.com/s/1sl8litZ",
        "https://pan.baidu.com/s/1dFBr37F",
        "https://pan.baidu.com/s/1o8K255K",
    ]

    # Step 1: drive Chrome through every share page.
    baiduyun_url_travel(share_url)

    # Step 2: read the links back from the History file in the profile
    # directory and print them.
    History_path = os.path.join(user_data_dir_path, "Default", "History")
    items = get_source_link_from_history(History_path)
    pprint.pprint(items)
FileItem.py:
# -*- coding: utf-8 -*-
#----------------------------
# Author: Kun Liu
# Start date: 2017-03-13
# Latest edit: 2017-03-13
#=============================
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
import pprint
class FileItem:
    """Simple holder for a file's name, its link and a time string."""

    def __init__(self, file_name="", file_link="", catch_time= ""):
        """Store the three values; ``catch_time`` is kept as ``file_time``."""
        self.file_name, self.file_link, self.file_time = (
            file_name,
            file_link,
            catch_time,
        )

    def make_dic(self):
        """Return the item as a dict with keys 'file_name', 'link' and 'time'."""
        return {
            "file_name": self.file_name,
            "link": self.file_link,
            "time": self.file_time,
        }


if __name__ == "__main__":
    # Nothing to do when this module is run directly.
    pass