爬虫 selenium
chromedriver下载
谷歌浏览器驱动下载地址:http://chromedriver.storage.googleapis.com/index.html 或 http://npm.taobao.org/mirrors/chromedriver/ 。下载的驱动程序版本必须与浏览器版本相对应,可以根据 http://blog.csdn.net/huilan_same/article/details/51896672 中提供的版本映射表进行对应。
开启浏览器界面(有界面模式)的爬虫
from time import sleep

from selenium import webdriver

# Drive a visible Chrome window: search Baidu for "python" and save the
# rendered result page to baidu.html.
bro = webdriver.Chrome(executable_path=r'D:\爬虫存储\chromedriver.exe')
try:
    bro.get(url='https://www.baidu.com/')

    sleep(2)
    bro.find_element_by_id('kw').send_keys('python')
    sleep(1)
    bro.find_element_by_id('su').click()
    # Only `sleep` is imported from `time`, so it must be called directly;
    # `time.sleep` would raise NameError here.
    sleep(2)

    with open('baidu.html', 'w', encoding='utf8') as f:
        f.write(bro.page_source)
finally:
    # Always shut the browser down, even if a step above fails.
    bro.quit()
不开启浏览器界面(无头模式)的爬虫
from time import sleep

from selenium import webdriver
from selenium.webdriver.chrome.options import Options

# Run Chrome without a visible window.
chrome_options = Options()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--disable-gpu')

url = 'https://movie.douban.com/typerank?type_name=%E6%83%8A%E6%82%9A&type=19&interval_id=100:90&action='
bro = webdriver.Chrome(executable_path=r'D:\爬虫存储\chromedriver.exe', chrome_options=chrome_options)
try:
    bro.get(url)

    # Scroll to the bottom of the page three times in total, pausing between
    # scrolls, then wait before the page source is captured.
    bro.execute_script('window.scrollTo(0,document.body.scrollHeight)')
    for _ in range(2):
        sleep(1)
        bro.execute_script('window.scrollTo(0,document.body.scrollHeight)')
    sleep(5)

    with open('douban.html', 'w', encoding='utf8') as f:
        f.write(bro.page_source)
finally:
    # Always release the driver, even if a step above fails; a leaked
    # headless browser has no window to reveal it.
    bro.quit()
获取浏览器的实时截图并设置浏览器窗口的大小
from time import sleep

from selenium import webdriver
from selenium.webdriver.chrome.options import Options

# Run Chrome without a visible window.
chrome_options = Options()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--disable-gpu')

# The URL passed to get() must include the scheme; a bare host name such as
# 'www.baidu.com' is not a valid navigation target.
url = 'https://www.baidu.com/'
bro = webdriver.Chrome(executable_path=r'D:\爬虫存储\chromedriver.exe', chrome_options=chrome_options)
try:
    # Set the window size before loading so the page renders at that size.
    bro.set_window_size(7680, 4320)
    bro.get(url)
    sleep(30)

    # Capture the current viewport as PNG bytes and save it to disk.
    data = bro.get_screenshot_as_png()
    with open('1.png', 'wb') as f:
        f.write(data)
finally:
    # Always release the driver, even if a step above fails.
    bro.quit()
当目标元素位于 iframe 中时,直接通过 id 等方式定位会找不到元素。解决方法:先切换到该 iframe,再定位元素。
# Elements inside an iframe cannot be located from the top-level document,
# so switch the driver into the frame first. `switch_to.frame` replaces the
# deprecated `switch_to_frame` helper.
bro.switch_to.frame('login_frame')
bro.find_element_by_id('switcher_plogin').click()
bro.find_element_by_id('u').send_keys('1132300949')
bro.find_element_by_id('login_button').click()
page_text = bro.page_source