selenium模块

浏览器驱动

from selenium import webdriver  # 用来驱动浏览器的
from selenium.webdriver import ActionChains  # 破解滑动验证码的时候用的 可以拖动图片
from selenium.webdriver.common.by import By  # 按照什么方式查找，By.ID,By.CSS_SELECTOR
from selenium.webdriver.common.keys import Keys  # 键盘按键操作
from selenium.webdriver.support import expected_conditions as EC  # 和下面WebDriverWait一起用的
from selenium.webdriver.support.wait import WebDriverWait  # 等待页面加载某些元素
import time
# Obtain the driver: instantiate the Chrome WebDriver.
chrome = webdriver.Chrome()

显隐等待

隐式等待：在 chrome.get('xxx') 之前设置，对之后所有元素的查找都有效
显式等待：在 chrome.get('xxx') 之后设置，只针对某个指定的元素有效

chrome.implicitly_wait(10)        # 隐式等待：写在 get 之前
wait = WebDriverWait(chrome, 10)  # 显式等待：写在 get 之后

网页前进后退

# Demo: moving backward and forward through the browser history.
try:
    # Load three pages in turn so the session has history entries.
    chrome.get('https://www.baidu.com/')
    chrome.get('https://www.tmall.com/')
    chrome.get('https://www.jd.com/')

    # Go back one history entry.
    chrome.back()
    # Go forward one history entry.
    chrome.forward()
    # Wait 5 seconds before shutting the browser down.
    time.sleep(5)
finally:
    # quit() ends the whole WebDriver session; close() would only close the
    # current window and leave the driver session behind.
    chrome.quit()

JS操作

# Demo: running JavaScript inside the loaded page.
try:
    chrome.get('https://www.baidu.com/')

    # execute_script() runs the given JavaScript source in the page; here it
    # opens an alert dialog. A longer script can be passed as a
    # triple-quoted string.
    chrome.execute_script("alert('傻眼了吧！')")

    # Wait 5 seconds before shutting the browser down.
    time.sleep(5)
finally:
    # quit() ends the whole WebDriver session; close() would only close the
    # current window and leave the driver session behind.
    chrome.quit()

find_elements_by_xpath

操作示例：
<html>
 <head>
  <base href='http://example.com/' />
  <title>Example website</title>
 </head>
 <body>
  <div id='images'>
   <a href='image1.html'>Name: My image 1 <br /><img src='image1_thumb.jpg' /></a>
   <a href='image2.html'>Name: My image 2 <br /><img src='image2_thumb.jpg' /></a>
   <a href='image3.html'>Name: My image 3 <br /><img src='image3_thumb.jpg' /></a>
   <a href='image4.html'>Name: My image 4 <br /><img src='image4_thumb.jpg' /></a>
   <a href='image5.html'>Name: My image 5 <br /><img src='image5_thumb.jpg' /></a>
  </div>
 </body>
</html>

　　使用路径表达式来选取 XML 文档中的节点或节点集

常用属性和方法：tag_name、text、get_attribute('属性名')、location（例如 img.location）

1.从根节点查找: /
2.从全局查找： //
3.查找某一层的下一层： //a/img
4.查找多个： find_elements_by_xpath('//a')，得到一个列表
5.查找第3个a标签的img: find_elements_by_xpath('//div/a[3]/img')，注意 XPath 的位置下标从 1 开始，不是从 0 开始的列表索引；
6.按id属性查找： find_elements_by_xpath('//*[@id="images"]/a[3]/img')

交互操作

from selenium import webdriver  # 用来驱动浏览器的
from selenium.webdriver import ActionChains  # 破解滑动验证码的时候用的 可以拖动图片
from selenium.webdriver.common.by import By  # 按照什么方式查找，By.ID,By.CSS_SELECTOR
from selenium.webdriver.common.keys import Keys  # 键盘按键操作
from selenium.webdriver.support import expected_conditions as EC  # 和下面WebDriverWait一起用的
from selenium.webdriver.support.wait import WebDriverWait  # 等待页面加载某些元素
import time

# Demo: typing into a search box and clicking the search button on Tmall.
chrome = webdriver.Chrome()
try:
    # Implicit wait: element lookups may wait up to 10 seconds.
    # Set inside the try block so a failure here still shuts the driver down.
    chrome.implicitly_wait(10)

    chrome.get('https://www.tmall.com/')

    # First search: type a keyword and submit it with the ENTER key.
    # find_element(By.ID, ...) replaces find_element_by_id, which newer
    # Selenium releases no longer provide.
    input_tag = chrome.find_element(By.ID, 'mq')
    input_tag.send_keys('时间革命')
    input_tag.send_keys(Keys.ENTER)

    # Second search: look the box up again, clear it and type a new keyword.
    input_tag = chrome.find_element(By.ID, 'mq')
    input_tag.clear()
    input_tag.send_keys('唐诗三百首')

    # This time submit by clicking the button next to the search box.
    button = chrome.find_element(
        By.XPATH, '//*[@class="mallSearch-input clearfix"]/button'
    )
    button.click()

    # Wait 5 seconds before shutting the browser down.
    time.sleep(5)

finally:
    # quit() ends the whole WebDriver session; close() would only close the
    # current window and leave the driver session behind.
    chrome.quit()

自动查找商品

# Demo: dragging one element onto another inside an iframe.
try:
    chrome.get('http://www.runoob.com/try/try.php?filename=jqueryui-api-droppable')

    # Switch into the 'iframeResult' frame before looking the elements up.
    # (switch_to_frame() is the deprecated spelling; use
    # chrome.switch_to.parent_frame() to return to the parent page.)
    chrome.switch_to.frame('iframeResult')
    source = chrome.find_element(By.ID, 'draggable')
    target = chrome.find_element(By.ID, 'droppable')
    print(source, target)

    # Option 1: let Selenium do the whole drag in a single action:
    #     ActionChains(chrome).drag_and_drop(source, target).perform()

    # Option 2 (used here): press and hold the source, move one pixel at a
    # time along the x axis, then release.
    ActionChains(chrome).click_and_hold(source).perform()
    # Horizontal distance between the two elements' x coordinates.
    distance = target.location.get('x') - source.location.get('x')
    s = 0
    while s < distance:
        ActionChains(chrome).move_by_offset(xoffset=1, yoffset=0).perform()
        s += 1

    ActionChains(chrome).release().perform()

    # Wait 5 seconds before shutting the browser down.
    time.sleep(5)

finally:
    # quit() ends the whole WebDriver session; close() would only close the
    # current window and leave the driver session behind.
    chrome.quit()

自动校验移动验证码

爬取京东商品信息

from selenium import webdriver  # 用来驱动浏览器的
from selenium.webdriver import ActionChains  # 破解滑动验证码的时候用的 可以拖动图片
from selenium.webdriver.common.by import By  # 按照什么方式查找，By.ID,By.CSS_SELECTOR
from selenium.webdriver.common.keys import Keys  # 键盘按键操作
from selenium.webdriver.support import expected_conditions as EC  # 和下面WebDriverWait一起用的
from selenium.webdriver.support.wait import WebDriverWait  # 等待页面加载某些元素
import time

def drver_star(driver, key):
    """Scrape the search-result pages shown by ``driver`` and save the items.

    For every ``gl-item`` element inside the ``J_goodsList`` container the
    product link, name, price and comment count are printed and appended to
    ``<key>.txt``. The function then clicks the "下一页" (next page) link and
    repeats, stopping when no such link is found.

    Pages are processed in a loop rather than by recursion, so the driver is
    shut down exactly once and the call depth does not grow with the number
    of pages.

    Args:
        driver: A Selenium WebDriver that is already showing a result page.
            It is shut down before this function returns or raises.
        key: The search keyword; used as the stem of the output file name.
    """
    # Runtime text of one record; the layout is part of the output file.
    template = '''

            ==============tank 商品信息 ================
                商品链接: %s
                商品名称: %s
                商品价格: %s
                评论人数: %s
            \n
            '''

    try:
        while True:
            div_obj = driver.find_element(By.ID, 'J_goodsList')
            li_list = div_obj.find_elements(By.CLASS_NAME, 'gl-item')

            # Open the output file once per page instead of once per item.
            with open('%s.txt' % key, 'a', encoding='utf-8') as f:
                for li in li_list:
                    # Product link
                    detail_link = li.find_element(
                        By.CSS_SELECTOR, '.p-img a'
                    ).get_attribute('href')
                    # Product name
                    g_name = li.find_element(By.CSS_SELECTOR, '.p-name em').text
                    # Product price
                    g_price = li.find_element(By.CSS_SELECTOR, '.p-price i').text
                    # Number of comments
                    g_commit = li.find_element(By.CSS_SELECTOR, '.p-commit a').text

                    goods = template % (detail_link, g_name, g_price, g_commit)
                    print(goods)
                    f.write(goods)

            # find_elements() returns an empty list when nothing matches, so
            # the last page ends the loop instead of raising.
            # NOTE(review): if the site keeps a disabled "下一页" link on the
            # last page this loop will not terminate -- confirm against the
            # live page and add a stop condition if needed.
            next_tags = driver.find_elements(By.PARTIAL_LINK_TEXT, '下一页')
            if not next_tags:
                break
            next_tags[0].click()
            # Give the next page time to load before scraping it.
            time.sleep(2)

        # Wait 5 seconds before shutting the browser down.
        time.sleep(5)
    finally:
        # Shut the driver down exactly once, whether scraping finished or
        # failed. quit() ends the whole session, not just the current window.
        driver.quit()

# Script entry point: ask for a keyword, search for it on jd.com, then hand
# the driver to drver_star() to scrape the result pages.
if __name__ == '__main__':
    key = input('请输入爬取的商品内容: ').strip()
    driver = webdriver.Chrome()
    try:
        # Implicit wait: element lookups may wait up to 10 seconds.
        driver.implicitly_wait(10)
        driver.get('https://www.jd.com/')
        # Type the keyword into the search box and submit it with ENTER.
        input_tag = driver.find_element(By.ID, 'key')
        input_tag.send_keys(key)
        input_tag.send_keys(Keys.ENTER)
    except Exception:
        # drver_star() shuts the driver down itself, but it never runs if
        # the setup above fails -- release the browser here in that case.
        driver.quit()
        raise
    drver_star(driver, key)

商品信息

posted @ 2019-03-09 19:21 ChuckXue 阅读(132) 评论(0) 编辑收藏举报

刷新页面返回顶部

量变引发质变

selenium模块

浏览器驱动

显隐等待

网页前进后退

JS操作

find_elements_by_xpath

交互操作

爬取京东商品信息

公告