selenium模块
浏览器驱动
import time

from selenium import webdriver  # drives the browser
from selenium.webdriver import ActionChains  # chained mouse actions, e.g. dragging the image of a slider captcha
from selenium.webdriver.common.by import By  # locator strategies: By.ID, By.CSS_SELECTOR, ...
from selenium.webdriver.common.keys import Keys  # keyboard key constants
from selenium.webdriver.support import expected_conditions as EC  # conditions used together with WebDriverWait below
from selenium.webdriver.support.wait import WebDriverWait  # waits for certain elements of the page to load

# Obtain the driver.
chrome = webdriver.Chrome()
显隐等待
隐式等待:在 chrome.get('xxx') 之前设置,对之后所有元素的查找都有效,例如 chrome.implicitly_wait(10)。 显式等待:在 chrome.get('xxx') 之后使用,只针对某个指定的元素有效,例如 wait = WebDriverWait(chrome, 10),再配合 wait.until(EC.presence_of_element_located((By.ID, 'xxx'))) 等待该元素出现。
网页前进后退
# Demo: moving backward and forward through the browser history.
# Relies on `chrome` (a WebDriver instance) and `time` being in scope.
try:
    # Load three pages in a row so the history has entries to move through.
    chrome.get('https://www.baidu.com/')
    chrome.get('https://www.tmall.com/')
    chrome.get('https://www.jd.com/')

    chrome.back()  # step back one entry in the history
    chrome.forward()  # step forward again
    time.sleep(5)  # keep the window open briefly so the result can be seen
finally:
    # quit() ends the whole WebDriver session and closes every associated
    # window; close() would only close the current window.
    chrome.quit()
JS操作
# Demo: running JavaScript inside the loaded page.
# Relies on `chrome` (a WebDriver instance) and `time` being in scope.
try:
    chrome.get('https://www.baidu.com/')
    # The script is passed as a string; a longer, multi-line script can be
    # passed as a triple-quoted string in the same way.
    chrome.execute_script("alert('傻眼了吧!')")
    time.sleep(5)  # leave the alert on screen for a moment
finally:
    # quit() ends the whole WebDriver session and closes every associated
    # window; close() would only close the current window.
    chrome.quit()
find_elements_by_xpath
操作示例:
<html> <head> <base href='http://example.com/' /> <title>Example website</title> </head> <body> <div id='images'> <a href='image1.html'>Name: My image 1 <br /><img src='image1_thumb.jpg' /></a> <a href='image2.html'>Name: My image 2 <br /><img src='image2_thumb.jpg' /></a> <a href='image3.html'>Name: My image 3 <br /><img src='image3_thumb.jpg' /></a> <a href='image4.html'>Name: My image 4 <br /><img src='image4_thumb.jpg' /></a> <a href='image5.html'>Name: My image 5 <br /><img src='image5_thumb.jpg' /></a> </div> </body> </html>
使用路径表达式来选取 XML 文档中的节点或节点集
常用属性和方法:tag_name、text、get_attribute('属性名')、location(例如 img.location)
1.从根节点查找: /
2.从全局查找: //
3.查找某一层的下一层: //a/img
4.查找多个: find_elements_by_xpath('//a') ,得到一个列表
5.查找第3个a标签的img: find_elements_by_xpath('//div/a[3]/img'),注意 XPath 的位置下标从 1 开始,不是从 0 开始的列表索引;
6.按id属性查找: find_elements_by_xpath('//*[@id="images"]/a[3]/img')
交互操作
import time

from selenium import webdriver  # drives the browser
from selenium.webdriver import ActionChains  # chained mouse actions, e.g. dragging the image of a slider captcha
from selenium.webdriver.common.by import By  # locator strategies: By.ID, By.CSS_SELECTOR, ...
from selenium.webdriver.common.keys import Keys  # keyboard key constants
from selenium.webdriver.support import expected_conditions as EC  # conditions used together with WebDriverWait below
from selenium.webdriver.support.wait import WebDriverWait  # waits for certain elements of the page to load

# Demo: typing into a search box, submitting with ENTER, then searching again
# by clicking the search button.
chrome = webdriver.Chrome()
chrome.implicitly_wait(10)  # every element lookup below waits up to 10 seconds

try:
    chrome.get('https://www.tmall.com/')

    # First search: type the keyword and submit with the ENTER key.
    # find_element(By.ID, ...) replaces find_element_by_id, which newer
    # Selenium releases no longer provide.
    input_tag = chrome.find_element(By.ID, 'mq')
    input_tag.send_keys('时间革命')
    input_tag.send_keys(Keys.ENTER)

    # The search box is looked up again after submitting, rather than
    # reusing the element found before ENTER was sent.
    input_tag = chrome.find_element(By.ID, 'mq')
    input_tag.clear()
    input_tag.send_keys('唐诗三百首')

    # Second search: submit by clicking the search button.
    button = chrome.find_element(
        By.XPATH, '//*[@class="mallSearch-input clearfix"]/button'
    )
    button.click()

    time.sleep(5)  # keep the window open briefly so the result can be seen
finally:
    # quit() ends the whole WebDriver session and closes every associated
    # window; close() would only close the current window.
    chrome.quit()
# Demo: dragging an element that lives inside an iframe onto a drop target.
# Relies on `chrome`, `By`, `ActionChains` and `time` being in scope.
try:
    chrome.get('http://www.runoob.com/try/try.php?filename=jqueryui-api-droppable')

    # Switch into the child page (iframe) before looking the elements up.
    # chrome.switch_to_frame(...) is the deprecated spelling of this call;
    # chrome.switch_to.parent_frame() switches back to the parent page.
    chrome.switch_to.frame('iframeResult')

    source = chrome.find_element(By.ID, 'draggable')
    target = chrome.find_element(By.ID, 'droppable')
    print(source, target)

    # Option 1: let Selenium perform the whole drag in one step:
    #     ActionChains(chrome).drag_and_drop(source, target).perform()

    # Option 2 (used here): press and hold the source, move right one pixel
    # at a time, then release.
    ActionChains(chrome).click_and_hold(source).perform()

    # Horizontal distance between the two elements' x positions.
    distance = target.location.get('x') - source.location.get('x')

    moved = 0
    while moved < distance:
        ActionChains(chrome).move_by_offset(xoffset=1, yoffset=0).perform()
        moved += 1

    ActionChains(chrome).release().perform()

    time.sleep(5)  # keep the window open briefly so the result can be seen
finally:
    # quit() ends the whole WebDriver session and closes every associated
    # window; close() would only close the current window.
    chrome.quit()
爬取京东商品信息
import time

from selenium import webdriver  # drives the browser
from selenium.common.exceptions import NoSuchElementException  # raised when an element lookup finds nothing
from selenium.webdriver import ActionChains  # chained mouse actions, e.g. dragging the image of a slider captcha
from selenium.webdriver.common.by import By  # locator strategies: By.ID, By.CSS_SELECTOR, ...
from selenium.webdriver.common.keys import Keys  # keyboard key constants
from selenium.webdriver.support import expected_conditions as EC  # conditions used together with WebDriverWait below
from selenium.webdriver.support.wait import WebDriverWait  # waits for certain elements of the page to load


def drver_star(driver, key):
    """Scrape the search result pages shown by *driver*, then shut it down.

    For every product on the current page, the detail link, name, price and
    comment count are printed and appended to ``<key>.txt`` (UTF-8). The
    '下一页' (next page) link is then clicked and the next page is processed
    the same way, until that link can no longer be found.

    Args:
        driver: WebDriver that is already showing a search result page
            containing the ``J_goodsList`` element.
        key: The search keyword; also used as the output file's base name.

    The driver is always shut down before this function returns or raises.
    """
    try:
        # Pages are handled in a loop rather than by recursing once per page,
        # so the cleanup in ``finally`` runs exactly once and the call stack
        # does not grow with the number of pages.
        while True:
            goods_container = driver.find_element(By.ID, 'J_goodsList')
            items = goods_container.find_elements(By.CLASS_NAME, 'gl-item')

            # One open() per page instead of one per product.
            with open('%s.txt' % key, 'a', encoding='utf-8') as f:
                for item in items:
                    # Product link
                    detail_link = item.find_element(
                        By.CSS_SELECTOR, '.p-img a'
                    ).get_attribute('href')
                    # Product name
                    g_name = item.find_element(By.CSS_SELECTOR, '.p-name em').text
                    # Product price
                    g_price = item.find_element(By.CSS_SELECTOR, '.p-price i').text
                    # Number of comments
                    g_commit = item.find_element(By.CSS_SELECTOR, '.p-commit a').text

                    goods = '''
==============tank 商品信息 ================
商品链接: %s
商品名称: %s
商品价格: %s
评论人数: %s
\n
''' % (detail_link, g_name, g_price, g_commit)

                    print(goods)
                    f.write(goods)

            # Stop cleanly once there is no next-page link left.
            try:
                next_tag = driver.find_element(By.PARTIAL_LINK_TEXT, '下一页')
            except NoSuchElementException:
                break

            next_tag.click()
            time.sleep(2)  # give the next page a moment to load

        time.sleep(5)
    finally:
        # quit() ends the whole WebDriver session and closes every associated
        # window; close() would only close the current window.
        driver.quit()


if __name__ == '__main__':
    key = input('请输入爬取的商品内容: ').strip()

    driver = webdriver.Chrome()
    driver.implicitly_wait(10)  # every element lookup waits up to 10 seconds
    driver.get('https://www.jd.com/')

    # Type the keyword into the search box and submit it with ENTER.
    input_tag = driver.find_element(By.ID, 'key')
    input_tag.send_keys(key)
    input_tag.send_keys(Keys.ENTER)

    drver_star(driver, key)