爬虫之selenium
安装
1 进入虚拟环境后安装 selenium:在 cmd 下输入 activate base
2 pip install selenium
简介:
selenium 就是利用浏览器驱动模拟浏览器访问并爬取页面。
优点:能有效的解决某些动态资源访问困难的问题
缺点:需要根据浏览器的具体版本选择下载浏览器驱动
谷歌浏览器驱动下载地址:http://chromedriver.storage.googleapis.com/index.html
下载的驱动程序必须和浏览器的版本一致,大家可以根据 http://blog.csdn.net/huilan_same/article/details/51896672 中提供的版本映射表进行对应
使用:
导入
# Standard library: sleep() is used to pause between browser actions.
from time import sleep

# Third party: webdriver provides the browser driver classes used below.
from selenium import webdriver
加载驱动:
# Path to the ChromeDriver executable; it is passed to the Chrome driver below.
chromedriver_path = r'C:\Users\Administrator\Desktop\爬虫+数据\day04\chromedriver.exe'
# Start a Chrome session driven by that executable.
bro = webdriver.Chrome(executable_path=chromedriver_path)
方法:
# Quick reference of commonly used calls on an existing driver object `bro`.

# Locate a specific element with the find_* family of functions.
my_input = bro.find_element_by_id('kw')

# Get the source of the page currently displayed in the browser.
page_text = bro.page_source

# Take a screenshot and save it to the given path.
bro.save_screenshot('./1.png')

###############################################
from selenium.webdriver.chrome.options import Options

# Build the Chrome options that enable headless mode.
chrome_options = Options()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--disable-gpu')

# Headless Chrome: same driver as before, started with the options above.
bro = webdriver.Chrome(executable_path=r'C:\Users\Administrator\Desktop\爬虫+数据\day04\chromedriver.exe', chrome_options=chrome_options)

# Scroll to the bottom of the current page by running JavaScript in it.
js = 'window.scrollTo(0,document.body.scrollHeight)'
bro.execute_script(js)

# Switch into the specified iframe before locating elements inside it.
bro.switch_to.frame('login_frame')
bro.find_element_by_id('switcher_plogin').click()
百度搜索迪丽热巴
# Search Baidu for a keyword with a visible Chrome window and print the
# resulting page source.
from time import sleep

# etree is not used in this snippet; the import is kept because the
# QQ-zone snippet later in this file calls etree.HTML without importing it.
from lxml import etree
from selenium import webdriver

bro = webdriver.Chrome(executable_path=r'G:\papa\day04\chromedriver.exe')
try:
    bro.get(url='https://www.baidu.com')
    sleep(1)

    # Type the query into the search box (element id "kw").
    my_input = bro.find_element_by_id("kw")
    my_input.send_keys("迪丽热巴")
    sleep(3)

    # Click the search button (element id "su") and pause before reading.
    bro.find_element_by_id('su').click()
    sleep(3)

    page_text = bro.page_source
    print(page_text)
finally:
    # Always shut the browser down, even if a step above raised.
    bro.quit()
无界面访问迪丽热巴
# Same Baidu search as above, driven through PhantomJS instead of Chrome.
# Relies on `webdriver` and `sleep` having been imported earlier.
bro = webdriver.PhantomJS(executable_path=r'G:\papa\day04\phantomjs-2.1.1-windows\bin\phantomjs.exe')
try:
    bro.get(url='https://www.baidu.com')
    sleep(1)

    # Type the query into the search box (element id "kw").
    my_input = bro.find_element_by_id("kw")
    my_input.send_keys("迪丽热巴")
    sleep(3)

    # Click the search button (element id "su") and pause before reading.
    bro.find_element_by_id('su').click()
    sleep(3)

    page_text = bro.page_source
    print(page_text)
finally:
    # Always terminate the driver, even if a step above raised.
    bro.quit()
谷歌无头访问
# Same Baidu search, driven through Chrome started in headless mode.
# Relies on `webdriver` and `sleep` having been imported earlier.
from selenium.webdriver.chrome.options import Options

chrome_options = Options()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--disable-gpu')

# Headless Chrome.
bro = webdriver.Chrome(executable_path=r'G:\papa\day04\chromedriver.exe', chrome_options=chrome_options)
try:
    bro.get(url='https://www.baidu.com')
    sleep(1)

    # Type the query into the search box (element id "kw").
    my_input = bro.find_element_by_id("kw")
    my_input.send_keys("迪丽热巴")
    sleep(3)

    # Click the search button (element id "su") and pause before reading.
    bro.find_element_by_id('su').click()
    sleep(3)

    page_text = bro.page_source
    print(page_text)
finally:
    # Always shut the browser down, even if a step above raised.
    bro.quit()
爬取豆瓣电影
# Open a Douban movie ranking page, scroll to the bottom three times,
# then print the page source.
# Relies on `webdriver` and `sleep` having been imported earlier.
bro = webdriver.Chrome(executable_path=r'G:\papa\day04\chromedriver.exe')
try:
    url = 'https://movie.douban.com/typerank?type_name=%E7%88%B1%E6%83%85&type=13&interval_id=100:90&action='
    bro.get(url=url)
    sleep(2)

    # JavaScript that scrolls the window to the bottom of the document.
    js = 'window.scrollTo(0,document.body.scrollHeight)'
    for _ in range(3):
        bro.execute_script(js)
        # Pause after each scroll before scrolling again.
        sleep(3)

    page_text = bro.page_source
    print(page_text)
finally:
    # Always shut the browser down, even if a step above raised.
    bro.quit()
爬取qq空间
# Log in to QQ zone through the login iframe, scroll the page, then print
# the text of the matched <div> elements.
# Relies on `webdriver`, `sleep` and `etree` having been imported earlier.
bro = webdriver.Chrome(executable_path=r'G:\papa\day04\chromedriver.exe')
try:
    url = 'https://qzone.qq.com/'
    bro.get(url=url)
    sleep(1)

    # The login controls are located after switching into this iframe.
    bro.switch_to.frame("login_frame")
    bro.find_element_by_id("switcher_plogin").click()
    sleep(1)

    # Fill in the account name (placeholder value).
    name = bro.find_element_by_id('u')
    name.send_keys('用户名')
    sleep(1)

    # Fill in the password (placeholder value).
    pwd = bro.find_element_by_id('p')
    pwd.send_keys('密码')
    sleep(1)

    bro.find_element_by_id("login_button").click()
    sleep(1)

    # JavaScript that scrolls the window to the bottom of the document.
    js = 'window.scrollTo(0,document.body.scrollHeight)'
    for _ in range(3):
        bro.execute_script(js)
        # Pause after each scroll before scrolling again.
        sleep(3)

    page_text = bro.page_source
    # NOTE(review): this pause runs after the page source has already been
    # captured - confirm whether it was meant to come before the capture.
    sleep(5)

    # Parse the captured HTML and select the <div> elements by class.
    tree = etree.HTML(page_text)
    div_list = tree.xpath('//div[@class="f-info qz_info_cut"] | //div[@class="f-info"]')
    for div in div_list:
        # Join every text node under the div into one string.
        text = div.xpath(".//text()")
        text = ''.join(text)
        print(text)
finally:
    # Always shut the browser down, even if a step above raised.
    bro.quit()