Erasing the webdriver fingerprint from a Python-controlled browser
Python web crawler: erase the webdriver fingerprint and get past Taobao's slider verification at login
YotaGit
Published 2021-04-18 16:08:46
For crawler engineers, the most common job is scraping data from e-commerce sites. So who actually wants e-commerce data? A few examples:
Advertisers: an advertiser crawls your shop on a schedule and analyses the data — sales volume, popularity, ratings, the ratings of newly listed products, the number of positive reviews per item. After several rounds of modelling and filtering they decide whether your shop has hit a sales bottleneck, and the sales calls follow.
Product design: Chinese product designers, especially in apparel, often find themselves short of inspiration. A crawler can put the hottest products on the market in front of them as reference material.
E-commerce platforms: sites such as Vipshop (唯品会) and MINISO (名创优品) have to price their goods against the rest of the industry — set prices too high and customers walk away. This is where a price-comparison crawler solves the problem.
Now to the main topic: how do you scrape Taobao's e-commerce data, and which of Taobao's verification mechanisms do you need to know about?
1. Login verification: the usual solution is to log in with webdriver and build a cookie pool from the sessions (a minimal sketch follows this list).
2. JS encryption: driving a real browser with webdriver is the simpler route; the alternative is reverse-engineering the JS signing/encryption, which is much harder.
3. Fake data: Taobao's risk-control system monitors what each account browses in real time. Once it decides you are a crawler it throws up a CAPTCHA; if you pass that and keep scraping, it may ban the account or start feeding you fake data from a dataset built specifically for crawlers.
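This article does not build the cookie pool itself, but the idea in point 1 is easy to sketch. Below is a minimal, hedged example of persisting a logged-in webdriver session so it can later be reused as one entry of a cookie pool; save_cookies, load_cookies and cookies.pkl are illustrative names, not part of the code in this article.
import pickle

def save_cookies(driver, path='cookies.pkl'):
    # dump the cookies of the current, logged-in session to disk
    with open(path, 'wb') as f:
        pickle.dump(driver.get_cookies(), f)

def load_cookies(driver, path='cookies.pkl'):
    # the target domain must already be open before add_cookie() is called
    driver.get('https://www.taobao.com/')
    with open(path, 'rb') as f:
        for cookie in pickle.load(f):
            driver.add_cookie(cookie)
    driver.refresh()  # reload so the restored cookies take effect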
Today we look at getting webdriver past the detection. A normal, unmodified test script looks like this:
import time
from selenium.webdriver import Chrome
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.wait import WebDriverWait

chrome_options = Options()
# chrome_options.add_argument("--headless")
# chrome_options.add_argument('disable-infobars')  # hides the "Chrome is being controlled by automated test software" bar
chrome_options.add_argument(
    'user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36')
driver = Chrome(executable_path='../chromedriver.exe', options=chrome_options)
driver.maximize_window()
try:
    driver.get('https://login.taobao.com/member/login.jhtml')
    WebDriverWait(driver, 5)
    time.sleep(5)
    login = driver.find_element_by_id("fm-login-id")
    login.send_keys('你的账号')  # your account
    WebDriverWait(driver, 5)
    time.sleep(5)
    pwd = driver.find_element_by_id('fm-login-password')
    pwd.click()
    pwd.send_keys('你的密码')  # your password
    WebDriverWait(driver, 5)
    time.sleep(5)
    button = driver.find_element_by_xpath('//button[@class="fm-button fm-submit password-login"]')
    WebDriverWait(driver, 5)
    button.click()
    time.sleep(10)
    driver.save_screenshot('taobao.png')
    # you can save the page source as html, open it and inspect the full result
    source = driver.page_source
    with open('result.html', 'wb') as f:
        f.write(source.encode())
except Exception as e:
    print(e)
driver.close()
However, the slider CAPTCHA appears.
Next we will inject a JS file. Before injecting it, let's see which webdriver fingerprints can be detected.
Here is the code:
import time
from selenium.webdriver import Chrome
from selenium.webdriver.chrome.options import Options

chrome_options = Options()
chrome_options.add_argument(
    'user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36')
driver = Chrome(executable_path='../chromedriver.exe', options=chrome_options)
driver.get('https://bot.sannysoft.com/')
time.sleep(5)
driver.save_screenshot('sannysoft.png')
source = driver.page_source
with open('result.html', 'w') as f:
    f.write(source)
driver.close()
driver.quit()
The result: the page detects that you are running webdriver.
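You can also do a quick spot check without the sannysoft page by asking the browser directly (run this before driver.close()); with an unpatched chromedriver the automation flag normally comes back as true:
# quick spot check of the most common fingerprints
print(driver.execute_script('return navigator.webdriver'))       # typically True before any patching
print(driver.execute_script('return navigator.plugins.length'))  # headless Chrome often reports 0 here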
Now let's inject the JS and try again:
import time
from selenium.webdriver import Chrome
from selenium.webdriver.chrome.options import Options

chrome_options = Options()
chrome_options.add_argument(
    'user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36')
driver = Chrome(executable_path='../chromedriver.exe', options=chrome_options)
# inject stealth.min.js into every new document before any page script runs
with open('../stealth.min.js') as f:
    js = f.read()
driver.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
    "source": js
})
driver.get('https://bot.sannysoft.com/')
time.sleep(5)
driver.save_screenshot('sannysoft.png')
source = driver.page_source
with open('result.html', 'w') as f:
    f.write(source)
driver.close()
driver.quit()
The webdriver fingerprint has been erased.
Now let's scrape Taobao's data for real. Note that Taobao and Tmall use the same verification mechanism: if a slider still appears after webdriver enters the password, the fingerprint has not been erased.
import json
import time

import pandas
from selenium.webdriver import Chrome, ActionChains
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.wait import WebDriverWait


class BaseSelenium(object):
    chrome_options = Options()
    chrome_options.add_argument("--headless")
    chrome_options.add_experimental_option('useAutomationExtension', False)  # drop the "controlled by automated test software" hint
    chrome_options.add_argument(
        'user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36'
    )
    chrome_options.add_argument("disable-blink-features=AutomationControlled")  # this is the line that tells Chrome to drop the webdriver traces

    def __init__(self):
        self.driver = Chrome(executable_path='../chromedriver.exe', options=self.chrome_options)
        self.driver.maximize_window()
        # inject stealth.min.js into every new document before page scripts run
        with open('../stealth.min.js') as f:
            js = f.read()
        self.driver.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
            "source": js
        })


class SeleniumTM(BaseSelenium):
    def __init__(self):
        super(SeleniumTM, self).__init__()
        # search keywords to crawl
        self.keyword_list = [
            'python数据分析',
            'python人工智能',
            'python后端开发',
            'python爬虫',
            'MySQL',
            'redis',
            'hadoop',
            'spark',
            'mongodb',
            '前端开发',
        ]

    def login(self):
        try:
            self.driver.get('https://login.tmall.com/')
            WebDriverWait(self.driver, 5)
            time.sleep(5)
            iframe = self.driver.find_element_by_id("J_loginIframe")
            self.driver.switch_to.frame(iframe)
            time.sleep(5)
            in_put = self.driver.find_element_by_xpath('//input[@id="fm-login-id"]')
            in_put.send_keys('你的账号')  # your account
            WebDriverWait(self.driver, 5)
            time.sleep(5)
            pwd = self.driver.find_element_by_xpath('//input[@id="fm-login-password"]')
            pwd.click()
            pwd.send_keys('你的密码')  # your password
            WebDriverWait(self.driver, 5)
            time.sleep(5)
            button = self.driver.find_element_by_xpath('//div[@class="fm-btn"]/button')
            WebDriverWait(self.driver, 5)
            button.click()
            time.sleep(10)
            self.driver.save_screenshot('login.png')
            # you can save the page source as html, open it and inspect the full result
            # source = self.driver.page_source
            # with open('result.html', 'wb') as f:
            #     f.write(source.encode())
        except Exception as e:
            print(e)
            self.close_driver()

    def get_track(self, distance):  # distance is the total distance to slide
        # movement track
        track = []
        # current offset
        current = 0
        # deceleration threshold: accelerate for 4/5 of the distance, then decelerate
        mid = distance * 4 / 5
        # time step
        t = 0.05
        # initial speed
        v = 1
        while current < distance:
            if current < mid:
                # accelerate
                a = 4
            else:
                # decelerate
                a = -3
            v0 = v
            # current speed
            v = v0 + a * t
            # distance moved in this step
            move = v0 * t + 1 / 2 * a * t * t
            # current offset
            current += move
            # append to the track
            track.append(round(move))
        return track

    def close_driver(self):
        self.driver.close()

    def search_data(self, next_page=None):
        for work in self.keyword_list[:]:
            mq = self.driver.find_element_by_id('mq')
            if mq.is_displayed():
                mq.clear()
                mq.send_keys(work)
                WebDriverWait(self.driver, 5)
                time.sleep(2)
            button = self.driver.find_element_by_css_selector('#mallSearch > form > fieldset > div > button')
            if button.is_displayed():
                button.click()
                WebDriverWait(self.driver, 5)
                time.sleep(2)
            self.parse_main()
            time.sleep(10)
        self.save_data()

    def parse_main(self):
        item = self.driver.find_element_by_id('J_ItemList')
        if item.is_displayed():
            products = item.find_elements_by_xpath('./div/div')
            total_page = item.find_element_by_xpath('//*[@id="content"]/div/div[8]/div/b[2]/form/input[3]')
            if total_page:
                total_page = int(total_page.get_attribute('value'))
            else:
                total_page = 0
            data_list = list()
            for p in products:
                data_dict = dict()
                price = p.find_element_by_xpath('./p[@class="productPrice"]/em')
                if price.is_displayed():
                    price = price.get_attribute('title')
                    data_dict['price'] = price
                detail_url = p.find_element_by_xpath('./div[@class="productImg-wrap"]/a')
                if detail_url.is_displayed():
                    detail_url = detail_url.get_attribute('href')
                    data_dict['detail_url'] = detail_url
                title = p.find_element_by_xpath('./p[@class="productTitle"]/a')
                if title.is_displayed():
                    title = title.get_attribute('title')
                    data_dict['title'] = title
                shop_url = p.find_element_by_xpath('./div[@class="productShop"]/a')
                if shop_url.is_displayed():
                    shop_url = shop_url.get_attribute('href')
                    data_dict['shop_url'] = shop_url
                shop_name = p.find_element_by_xpath('./div[@class="productShop"]/a')
                if shop_name.is_displayed():
                    shop_name = shop_name.text
                    data_dict['shop_name'] = shop_name
                month_sale = p.find_element_by_xpath('./p[@class="productStatus"]/span[1]/em')
                if month_sale.is_displayed():
                    month_sale = month_sale.text
                    data_dict['month_sale'] = "月成交" + month_sale
                month_sale = p.find_element_by_xpath('./p[@class="productStatus"]/span[2]/a')
                if month_sale.is_displayed():
                    month_sale = month_sale.text
                    # NOTE: this overwrites the monthly-sales value above with the review count
                    data_dict['month_sale'] = "评价 " + month_sale
                if data_dict:
                    data_list.append(data_dict)
            if data_list:
                self.save_json(data_list)
            current_page = self.driver.find_element_by_css_selector(
                '#content > div > div.ui-page > div > b.ui-page-num > b.ui-page-cur')
            if current_page.is_displayed():
                current_page = int(current_page.text)
                if current_page < total_page:
                    next_page = self.driver.find_element_by_css_selector(
                        '#content > div > div.ui-page > div > b.ui-page-num > a.ui-page-next')
                    if next_page.is_displayed():
                        next_page.click()
                        WebDriverWait(self.driver, 5)
                        time.sleep(5)
                        self.check_stock()
                        self.parse_main()
                        time.sleep(5)

    def save_json(self, data):
        with open('../tianmao/json.json', 'ab+') as f:
            f.write((json.dumps(data, ensure_ascii=False) + '\n').encode())

    def save_data(self):
        data = list()
        with open('../tianmao/json.json', 'rb') as f:
            while 1:
                item = f.readline()
                if not item:       # EOF
                    break
                if item.strip():   # skip blank lines
                    data.append(json.loads(item.decode().replace('\n', '')))
        df = pandas.DataFrame(data)
        df.to_excel('TM.xls', index=False, encoding='utf_8_sig')

    def has_stock(self):
        # nc_1_n1z is the drag button of the Taobao slider CAPTCHA
        nc_1_n1z = self.driver.find_element_by_id('nc_1_n1z')
        if nc_1_n1z.is_displayed():
            return nc_1_n1z

    def check_stock(self):
        nc_1_n1z = self.has_stock()
        if nc_1_n1z:
            tracks = self.get_track(400)
            ActionChains(self.driver).click_and_hold(nc_1_n1z).perform()
            for x in tracks:
                ActionChains(self.driver).move_by_offset(xoffset=x, yoffset=0).perform()
            time.sleep(0.5)
            ActionChains(self.driver).release().perform()

    def start_se(self):
        self.login()
        self.search_data()
        self.parse_main()
        self.driver.close()


if __name__ == '__main__':
    stm = SeleniumTM()
    stm.start_se()
The results are as follows.
stealth.min.js download link: https://github.com/berstend/puppeteer-extra/tree/master/packages/extract-stealth-evasions
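If you would rather generate the file yourself than download it, the extract-stealth-evasions package in that repository is intended to be run with npx (something like `npx extract-stealth-evasions`), which should write a fresh stealth.min.js into the current directory.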
————————————————
Copyright notice: this is an original article by CSDN blogger "YotaGit", licensed under CC 4.0 BY-SA. Please include the original source link and this notice when reposting.
Original link: https://blog.csdn.net/zwxlyg/article/details/115832405