Python selenium

Install selenium with pip using the command: pip install selenium

Let's write a small example with selenium: it opens the Baidu home page, types 网络爬虫 (web crawler) into the search box, and runs the search. The code is as follows:

#coding:utf-8
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import time

# point executable_path at the geckodriver binary downloaded for Firefox
driver = webdriver.Firefox(executable_path='D:/Program Files (x86)/Mozilla Firefox/geckodriver.exe')

driver.get("http://www.baidu.com")

assert u"百度" in driver.title

elem = driver.find_element_by_name('wd')   # the Baidu search box has name="wd"
elem.clear()
elem.send_keys(u"网络爬虫")    # type the query into the search box
elem.send_keys(Keys.RETURN)    # press Enter to run the search
time.sleep(3)                  # crude wait for the results page to load
assert u"网络爬虫" in driver.page_source
driver.close()

If you see the following error:

selenium.common.exceptions.WebDriverException: Message: 'geckodriver' executable needs to be in PATH.

then you need to download geckodriver and point executable_path at where you saved it; in the example above it is stored as D:/Program Files (x86)/Mozilla Firefox/geckodriver.exe
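Alternatively, instead of passing executable_path, you can make the directory that contains geckodriver visible on PATH before creating the driver. A minimal sketch (the directory below matches the example above and is an assumption about your setup):

import os
# add the geckodriver directory to PATH so selenium can find the binary itself
os.environ["PATH"] += os.pathsep + r"D:/Program Files (x86)/Mozilla Firefox"
driver = webdriver.Firefox()   # no executable_path needed once geckodriver is on PATH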

Locating elements with selenium

The find_element_by_* methods return the first matching element, while the corresponding find_elements_by_* methods return a list of all matches (see the short sketch after the list):

find_element_by_id / find_elements_by_id: locate by element id
find_element_by_name / find_elements_by_name: locate by element name
find_element_by_xpath / find_elements_by_xpath: locate by XPath expression
find_element_by_link_text / find_elements_by_link_text: locate by the full text of a link
find_element_by_partial_link_text / find_elements_by_partial_link_text: locate by partial link text
find_element_by_tag_name / find_elements_by_tag_name: locate by tag name
find_element_by_class_name / find_elements_by_class_name: locate by class name
find_element_by_css_selector / find_elements_by_css_selector: locate by CSS selector
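As a quick illustration of the difference between the singular and plural forms, here is a minimal sketch (it assumes driver already has a page loaded that contains links):

first_link = driver.find_element_by_tag_name("a")    # a single WebElement (raises NoSuchElementException if none)
all_links = driver.find_elements_by_tag_name("a")    # a list of WebElements (possibly empty)
print len(all_links)
print first_link.get_attribute("href")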

 

Take the following login page (assume it is saved locally as login.html) as an example:

<html>
<head>
<meta http-equiv="content-type" content="text/html; charset=utf-8">
</head>
<body>
<h1> Welcome </h1>
<p class="content">用户登录</p>
<form id="loginForm">
<select name="loginways">
<option value="email">邮箱</option>
<option value="mobile">手机号</option>
<option value="name">用户名</option>
</select>
<br/>
<input name ="username" type="text"/>
<br/>
密码
<br/>
<input name="password" type="password"/>
<br/><br/>
<input name ="continue" type="submit" value="Login"/>
<input name ="continue" type="button" value="Clear"/>
</form>
<a href ="register.html">Register</a>
</body>
</html>

#coding:utf-8
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import Select  # Select is used to operate <select> elements
import time

driver = webdriver.Firefox(executable_path='D:/Program Files (x86)/Mozilla Firefox/geckodriver.exe')

# open the local login.html shown above (replace the path with wherever you saved it)
driver.get("file:///path/to/login.html")

username = driver.find_element_by_name("username")
password = driver.find_element_by_xpath(".//*[@id='loginForm']/input[2]")
login_button = driver.find_element_by_xpath("//input[@type='submit']")

# get the select element
select = Select(driver.find_element_by_xpath('//form/select'))
select.select_by_index(1)                # select an option by its index
select.select_by_visible_text(u"邮箱")   # select by the option's visible text
select.select_by_value("mobile")         # select by the option's value attribute

username.send_keys("paul")   # type into the input box
password.send_keys("floki")

login_button.click()

username.clear()   # clear the contents of the input box
password.clear()

Dragging and dropping elements

Dragging an element means moving one element onto the position of another, a bit like fitting a puzzle piece. First locate the source element and the target element, then use the ActionChains class to perform the drag. The code is as follows:

from selenium.webdriver import ActionChains

element = driver.find_element_by_name("source")

target = driver.find_element_by_name("target")

action_chains = ActionChains(driver)

action_chains.drag_and_drop(element,target).perform()
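ActionChains can also drag the source element by a pixel offset instead of onto a target element (a minimal sketch; the offsets are arbitrary):

# drag the source element 100 pixels right and 50 pixels down
ActionChains(driver).drag_and_drop_by_offset(element, 100, 50).perform()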

Switching between windows and page frames

A browser session usually ends up with more than one window open; you can switch to a specific window with the switch_to_window method:
driver.switch_to_window("windowName")
You can also iterate over the window handles to get hold of each window in turn. For example:

for handle in driver.window_handles:
    driver.switch_to_window(handle)

To switch to a frame within the page, use switch_to_frame:
driver.switch_to_frame("frameName")
driver.switch_to_frame("frameName.0.child")

Handling pop-up dialogs

If an event triggered while working with the page opens an alert dialog, you can use switch_to_alert to get the alert object and then dismiss it, read its message, and so on:

alert = driver.switch_to_alert()
alert.dismiss()
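Besides dismissing the dialog, the alert object also exposes the dialog text and an accept action (a minimal sketch):

alert = driver.switch_to_alert()
print alert.text    # read the message shown in the dialog
alert.accept()      # confirm the dialog instead of dismissing it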

Browser history

You can move forward and backward through the page history:

driver.forward()
driver.back()
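For example, after visiting two pages in a row you can step back to the first and then forward to the second again (a minimal sketch):

driver.get("http://www.baidu.com")
driver.get("http://hotel.qunar.com/")
driver.back()      # back to www.baidu.com
driver.forward()   # forward to hotel.qunar.com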

 

Crawling Qunar (去哪儿网)

The example below searches hotel.qunar.com for hotels in a given city on given dates and appends the text of each result page to a local HTML file:

 

# coding:utf-8
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import Select  # Select is used to operate <select> elements
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from bs4 import BeautifulSoup
from datetime import timedelta
import time, datetime
import codecs

class QunaSpider(object):

    def get_hotel(self, driver, to_city, fromdate, todate):
        ele_toCity = driver.find_element_by_name('toCity')
        ele_fromDate = driver.find_element_by_id('fromDate')
        ele_toDate = driver.find_element_by_id('toDate')
        ele_search = driver.find_element_by_class_name('search-btn')
        ele_toCity.clear()
        ele_toCity.send_keys(to_city)   # type the city name into the input box
        ele_toCity.click()
        ele_fromDate.clear()
        ele_fromDate.send_keys(fromdate)
        ele_toDate.clear()
        ele_toDate.send_keys(todate)
        ele_search.click()

        page_num = 0
        while True:
            try:
                # wait until the page title contains the city name
                WebDriverWait(driver, 10).until(EC.title_contains(unicode(to_city)))
            except Exception, e:
                print e
                break

            time.sleep(5)
            # scroll to the bottom so lazily loaded results are rendered
            js = "window.scrollTo(0,document.body.scrollHeight);"
            driver.execute_script(js)
            time.sleep(5)

            htm_const = driver.page_source
            soup = BeautifulSoup(htm_const, 'html.parser')
            infos = soup.find_all(class_='item_hotel_info')
            f = codecs.open(unicode(to_city) + unicode(fromdate) + u'.html', 'a', 'utf-8')
            for info in infos:
                f.write(str(page_num) + '--' * 50)
                content = info.get_text().replace(" ", "").replace("\t", "").strip()
                for line in [ln for ln in content.splitlines() if ln.strip()]:
                    f.write(line)
                    f.write('\r\n')
            try:
                # wait until the "next page" button is visible, then click it
                next_page = WebDriverWait(driver, 10).until(
                    EC.visibility_of(driver.find_element_by_css_selector(".item.next"))
                )
                next_page.click()
                page_num += 1
                time.sleep(10)
            except Exception, e:
                print e
                break
        f.close()

    def crawl(self, root_url, to_city):
        today = datetime.date.today().strftime('%Y-%m-%d')
        tomorrow = datetime.date.today() + datetime.timedelta(days=1)
        tomorrow = tomorrow.strftime('%Y-%m-%d')

        driver = webdriver.Firefox(executable_path='D:/Program Files (x86)/Mozilla Firefox/geckodriver.exe')
        driver.set_page_load_timeout(50)
        driver.get(root_url)
        driver.maximize_window()
        driver.implicitly_wait(10)
        self.get_hotel(driver, to_city, today, tomorrow)

if __name__ == '__main__':
    spider = QunaSpider()
    spider.crawl('http://hotel.qunar.com/', u"上海")
 
