爬虫04--selenium库、打码平台、Xpath语法

1 selenium库

# selenium是一个Web自动化测试工具
爬虫中使用它主要是为了解决requests无法直接执行JavaScript代码的问题

# 作用：操作浏览器，模拟人的行为，支持多种浏览器

from selenium import webdriver
browser=webdriver.Chrome()
browser=webdriver.Firefox()
browser=webdriver.PhantomJS()
browser=webdriver.Safari()
browser=webdriver.Edge() 

# 下载浏览器驱动：以谷歌浏览器为例（对应自己浏览器的版本号）
http://npm.taobao.org/mirrors/chromedriver/

下载chromedriver.exe放到python安装路径的Scripts目录中即可，这样打开浏览器时不用给executable_path传参
或下载到项目目录下，executable_path='./chromedriver.exe'
    
    
# 安装selenium模块
pip3 install selenium

1.0 基本使用

from selenium import webdriver
import time

# 浏览器对象
bro = webdriver.Chrome(executable_path='./chromedriver.exe')

# 隐式等待，去找控件，如果没有会等10s
bro.implicitly_wait(10)  

# 在浏览器中输入一个网站 ,并访问
bro.get('https://www.baidu.com/')

# 找到页面中登录按钮
# sub_button = bro.find_element_by_css_selector('#s-top-loginbtn') # css选择器
sub_button = bro.find_element_by_id('s-top-loginbtn')  # 如果有id，优先用它

# 点击登录按钮
sub_button.click()

# 找到用户名密码 登录按钮
user_btn = bro.find_element_by_xpath('//*[@id="TANGRAM__PSP_11__footerULoginBtn"]')
# user_btn=bro.find_element_by_id('TANGRAM__PSP_11__footerULoginBtn')
user_btn.click()

username = bro.find_element_by_id('TANGRAM__PSP_11__userName')
password = bro.find_element_by_id('TANGRAM__PSP_11__password')

# 往输入框中写东西
username.send_keys('6666666@qq.com')
password.send_keys('lqz12345')

sumbit_btn = bro.find_element_by_id('TANGRAM__PSP_11__submit')
time.sleep(3)

# 点击该控件
sumbit_btn.click()

time.sleep(3)

# 关闭浏览器当前标签页
bro.close()

# 退出(关闭)整个浏览器
bro.quit()

1.1 无界面浏览器

from selenium import webdriver
from selenium.webdriver.chrome.options import Options

chrome_options = Options()

# 指定浏览器分辨率
chrome_options.add_argument('window-size=1920x3000')
# 谷歌文档提到需要加上这个属性来规避bug
chrome_options.add_argument('--disable-gpu')
# 隐藏滚动条, 应对一些特殊页面
chrome_options.add_argument('--hide-scrollbars')
# 不加载图片, 提升速度
chrome_options.add_argument('blink-settings=imagesEnabled=false')
# 浏览器不提供可视化页面. linux下如果系统不支持可视化，不加这条会启动失败
chrome_options.add_argument('--headless')
# 取消自动化软件测试检验
chrome_options.add_experimental_option('excludeSwitches', ['enable-automation'])


bro = webdriver.Chrome(executable_path='./chromedriver.exe', options=chrome_options)

# 在浏览器中输入一个网站 ,并访问
bro.get('https://www.cnblogs.com/xiaoyuanqujing/articles/11805718.html')
print(bro.page_source)
bro.close()

1.2 查找元素、获取元素位置/属性/大小

# 1.查找元素的方法
  以往是 find_element_by_xx()  现在使用find_element(By.xx, '') 代替
    
from selenium.webdriver.common.by import By

driver.find_element(By.xx, '')  # 标签有id, 优先是id(唯一)
# .find_elements(By.xx, '') 的形式是查找到多个元素，结果为列表

# 自带选择器
  By.ID          # 标签 id
    .CLASS_NAME  # 标签 类名
    .TAG_NAME    # 标签 名
    .NAME        # 标签 name属性
    .LINK_TEXT   # 连接标签的内容 eg: a标签内容
    .PARTIAL_LINK_TEXT  # 连接标签的部分内容 eg: a标签内容 模糊匹配
    
# 三方选择器
    .XPATH         # XPath选择器
    .CSS_SELECTOR  # css选择器 根据css查找标签的规则  eg: #id .类名
    # 看源码，By.ID / By.CLASS_NAME / By.NAME 等自带选择器在内部会被转换成 By.CSS_SELECTOR
    
    
# 2.获取元素的内容
.id        # selenium提供的id，忽略

.text      # 获取文本内容
.tag_name  # 获取标签名
.location  # 获取位置 (x, y)
.size      # 获取大小 (height, width)

    # 后续根据位置和大小把图截出来，一般是验证码，破解，自动输入
    图片四个点： 
    x, y           x+width, y
    x, y+height    x+width, y+height

    
# 3.获取属性
.get_attribute('src')

# eg：

from selenium import webdriver
from selenium.webdriver.common.by import By
import time

driver = webdriver.Chrome(executable_path='chromedriver.exe')
driver.get('https://kyfw.12306.cn/otn/resources/login.html')
driver.implicitly_wait(10)

# 查找元素--根据css选择器
user_login = driver.find_element(By.CSS_SELECTOR, '.login-hd-account>a')

# 点击控件
user_login.click()
time.sleep(2)

img = driver.find_element(By.ID, 'J-qrImg')
print(img)  # 是selenium的元素类


# 获取标签名
img.tag_name  

# 获取位置 (x, y)
img.location  

# 获取大小 (height, width)
img.size  

# 获取属性
print(img.get_attribute('class'))

driver.close()

1.3 等待元素被加载

# 两种等待方式
  显式等待: 指定某个标签等待，找到元素后，需要手动设置等待      麻烦且少用  忽略
  隐式等待：等待所有要获取的标签，如果没有加载出来，就会等待10s  常用且简单  一句话


from selenium import webdriver
from selenium.webdriver.support.wait import WebDriverWait  
from selenium.webdriver.support import expected_conditions as EC


driver=webdriver.Chrome(executable_path='chromedriver.exe')
driver.get('https://www.baidu.com')


# 两种等待方式
# 显式等待(忽略掉)
    wait=WebDriverWait(driver,10)
    wait.until(EC.presence_of_element_located((By.ID,'content_left')))
    
    contents=driver.find_element(By.CSS_SELECTOR,'#content_left')
    
# 隐式等待：获取元素前设置
    driver.implicitly_wait(10)  # 一句话，查找元素之前设置
    
    driver.find_element(By.CSS_SELECTOR,'#content_left')
    

driver.implicitly_wait(10)  # 再找控件，只要没加载成功，就会等待，最多等10s

print(driver.page_source)
driver.close()

1.4 元素交互操作

1.4.1 输入、清空、点击、提交

# 元素操作: 输入、清空、点击、提交

from selenium import webdriver
import time

driver=webdriver.Chrome(executable_path='chromedriver.exe')
driver.get('https://www.baidu.com')
driver.implicitly_wait(10)

input_search=driver.find_element_by_id('kw')

# 输入
input_search.send_keys('美女')  
time.sleep(3)

# 清空
input_search.clear() 
time.sleep(2)

btn=driver.find_element_by_id('su')

# 点击
btn.click()  
time.sleep(10)

# 提交
btn.submit() 

driver.close()

1.4.2 动作链

# 常用来处理: 滑动验证码、点击移动某个元素


# 方法：
  from selenium.webdriver import ActionChains

  # 获取动作链对象
  actions=ActionChains(driver)  

  # 把source元素 拖拽到 target元素 当中，松开左键
  .drag_and_drop(sourse,target)  

  # 按偏移量拖拽hk元素(XY轴的距离 单位:px) 
  .drag_and_drop_by_offset(hk,300,0)

  # 左键点击source元素，保持不松开
  .click_and_hold(sourse)

  # 将鼠标移动到 偏移位置
  .move_by_offset(xoffset=2,yoffset=0)

  # 松开左键
  .release()

  # 执行动作链对象
  .perform()  



# eg:

from selenium import webdriver
from selenium.webdriver import ActionChains
from selenium.webdriver.common.by import By  # 按照什么方式查找，By.ID,By.CSS_SELECTOR
from selenium.webdriver.common.keys import Keys  # 键盘按键操作
from selenium.webdriver.support import expected_conditions as EC

import time

driver = webdriver.Chrome()
driver.get('http://www.runoob.com/try/try.php?filename=jqueryui-api-droppable')
driver.implicitly_wait(3)  # 使用隐式等待

try:
    # 切换到id为iframeResult的 frame框架
    driver.switch_to.frame('iframeResult') 
    # 切回父frame
    # browser.switch_to.parent_frame()
    
    # 分别找到两个控件
    sourse=driver.find_element_by_id('draggable')
    target=driver.find_element_by_id('droppable')

# 方式一：基于同一个动作链，串行执行
	actions=ActionChains(driver)  # 拿到动作链对象
    
    # 把source元素 直接拖拽到 target元素 当中
	actions.drag_and_drop(sourse,target)  
    
    # 按偏移量拖拽hk元素(XY轴的距离 单位:px) 
    # actions.drag_and_drop_by_offset(hk,300,0)
    
	actions.perform()  # 释放鼠标，让动作链执行

# 方式二：基于不同的动作链，分步执行
    思路:
        1.分别获取source和target元素的x轴位置，获取需要移动x轴像素
        2.点击不松开source元素
        3.将当前鼠标 循环按照固定偏移量进行移动
        4.松开鼠标

    ActionChains(driver).click_and_hold(sourse).perform()
    distance=target.location['x']-sourse.location['x']

    track=0
    while track < distance:
        ActionChains(driver).move_by_offset(xoffset=2,yoffset=0).perform()
        track+=2

    ActionChains(driver).release().perform()
    time.sleep(10)


finally:
    driver.close()


# 注:
  1.frame框架相当于一个单独的网页，在父frame里是无法直接查看到子frame的元素的，
    必须switch_to.frame切到该frame下，才能进一步查找
    了解frame就行，因为性能问题，现在一般都弃用了
    
  2.在交互动作比较难实现的时候可以自己写JS(万能方法) 执行JS代码

1.5 执行js

# 执行js语句
.execute_script('js语句') 


# eg:
from selenium import webdriver
import time

driver=webdriver.Chrome(executable_path='chromedriver.exe')
driver.get('http://127.0.0.1:8000/')
driver.implicitly_wait(10)

# Run JavaScript in the page. The DOM property is document.cookie (singular);
# document.cookies does not exist, so the alert would only show "undefined".
driver.execute_script("alert(document.cookie)")  # show the page's cookies

time.sleep(5)
driver.close()

1.6 其他操作(了解)

1.6.1 切换标签页

# 获取所有标签页
.window_handles

# 切换浏览器的标签页
.switch_to.window(browser.window_handles[1])


# eg:
import time
from selenium import webdriver

browser=webdriver.Chrome()
browser.get('https://www.baidu.com')

# 用js代码 新建标签页
browser.execute_script('window.open()')

# 获取所有标签页
print(browser.window_handles) 

# 切换标签页
browser.switch_to.window(browser.window_handles[1])
browser.get('https://www.taobao.com')
time.sleep(5)


browser.switch_to.window(browser.window_handles[0])
browser.get('https://www.sina.com.cn')

# 关闭当前标签页
browser.close()

1.6.2 模拟前进后退

import time
from selenium import webdriver

browser=webdriver.Chrome(executable_path='chromedriver.exe')
browser.get('https://www.baidu.com')
browser.get('https://www.taobao.com')
browser.get('http://www.sina.com.cn/')

# 后退
browser.back()
time.sleep(3)

# 前进
browser.forward()

browser.close()

1.6.3 异常处理

# selenium若是执行代码错误，浏览器不会关闭，故常用异常处理包裹

from selenium import webdriver
from selenium.common.exceptions import TimeoutException,NoSuchElementException,NoSuchFrameException

browser = webdriver.Chrome()

try:
    browser.get('http://www.baidu.com')
except Exception as e:
    print(e)
finally:
    browser.close()

1.6.4 cookies操作

# cookies操作
from selenium import webdriver

browser=webdriver.Chrome()
browser.get('https://www.zhihu.com/explore')

# 获取所有的cookies
print(browser.get_cookies())

# Set cookies: add_cookie() takes ONE cookie per call, as a dict that must
# contain the 'name' and 'value' keys, so k1 and k2 are added separately.
browser.add_cookie({'name': 'k1', 'value': 'xxx'})
browser.add_cookie({'name': 'k2', 'value': 'yyy'})
print(browser.get_cookies())

# 删除所有的cookies
browser.delete_all_cookies()

1.6.5 指定窗口大小

# 全屏
driver.maximize_window()
 
# 具体大小
driver.set_window_size(width, height)

1.6.6 切换frame框架页面

iframe是html中常用的一种技术，即一个页面中嵌套了另一个网页，selenium默认是访问不了frame中的内容的，对应的解决思路是

# 切换到某个 子frame
driver.switch_to.frame(name | el | id)  # 传入的参数 为iframe对应的id值 或 用元素定位之后的元素对象

# 切回父frame
driver.switch_to.parent_frame()


# eg：qq邮箱
在使用selenium登录qq邮箱的过程中，发现无法在邮箱的登录input标签中输入内容
通过观察源码可以发现，form表单在一个frame中，所以需要切换到frame中

1.6.7 处理页面弹窗

当你触发了某个事件之后，页面出现了弹窗提示，处理这个提示或者获取提示信息

# switch_to.alert is a property returning the Alert object; the legacy
# switch_to_alert() method was deprecated and is gone in Selenium 4.
alert = driver.switch_to.alert

1.7 案例

1.7.1 12306自动登录--滑动验证码

from selenium import webdriver
from selenium.webdriver.support.wait import WebDriverWait  # 等待页面加载某些元素
import time
from selenium.webdriver import ActionChains


from selenium.webdriver.chrome.options import Options

# 防止前端检测出我们是通过自动化软件控制的
options = Options()
options.add_argument("--disable-blink-features=AutomationControlled")

# Pass the Options object through the `options` keyword, as the headless
# example earlier in this file does; `chrome_options` is the deprecated alias.
driver=webdriver.Chrome(executable_path='./chromedriver.exe',options=options)
driver.implicitly_wait(3)  
driver.get('https://kyfw.12306.cn/otn/resources/login.html')

try:
    username_login=driver.find_element_by_link_text('账号登录')
    username_login.click()

    username=driver.find_element_by_id('J-userName')
    password=driver.find_element_by_id('J-password')
    username.send_keys('18953675221')
    password.send_keys('fddddere')
    time.sleep(2)
    submit=driver.find_element_by_id('J-login')
    submit.click()
    time.sleep(5)

    
    hk=driver.find_element_by_id('nc_1_n1z')

    actions = ActionChains(driver)  # 拿到动作链对象
    actions.drag_and_drop_by_offset(hk,300,0)  # 按偏移量拖拽hk元素(XY轴的距离 单位:px) 
    actions.perform()  # 执行动作链
    
    time.sleep(50)
finally:
    driver.close()

1.7.2 博客园半自动登录--获取cookie

# 目的：为了登录到某个网站，拿到cookies

# 操作流程:  能自动登录最好，不行就是半自动登录的方式(就是自己输密码或验证码)
半自动登录到cnblogs，拿到cookie，保存到本地
下次，再打开页面，直接将本地cookie写入到浏览器，就是登录状态


import time
from selenium import webdriver

driver = webdriver.Chrome()
driver.implicitly_wait(10)

###########  登录过程###########
try:
    driver.get('https://www.cnblogs.com/')

    # 找到登录，点击
    login = driver.find_element_by_css_selector('#navbar_login_status > a:nth-child(6)')
    login.click()
    username = driver.find_element_by_id('mat-input-0')
    password = driver.find_element_by_id('mat-input-1')
    username.send_keys('616564099@qq.com')
    password.send_keys('lqz12345')

    input('手动输入密码')

    summit = driver.find_element_by_css_selector(
        'body > app-root > mat-sidenav-container > mat-sidenav-content > div > div > app-sign-in > app-content-container > div > div > div > form > div > button')
    summit.click()

    # 验证码（自动破解最好、不行就自己手动破解）
    input('已经破解了验证码，敲回车')

    # 获取cookie
    print(type(driver.get_cookies()))

    # 把cookie保存到文件中
    import json

    with open('cnblogs.json', 'w', encoding='utf-8') as f:
        f.write(json.dumps(driver.get_cookies()))

    time.sleep(5)

except Exception as e:
    print(e)

finally:
    driver.close()


#### Log in directly with the saved cookies
import json

# The login phase above ends with driver.close(), which ends that browser
# session, so a fresh browser is started before replaying the cookies.
driver = webdriver.Chrome()
driver.implicitly_wait(10)

try:
    # Open the site first so the cookies are added for the matching domain.
    driver.get('https://www.cnblogs.com/')

    # Load the cookies saved by the login phase: a JSON list of cookie dicts.
    with open('cnblogs.json', 'r', encoding='utf-8') as f:
        cookies = json.load(f)

    # add_cookie() accepts one cookie dict at a time, hence the loop.
    for cookie in cookies:
        driver.add_cookie(cookie)

    # Reload so the page is requested again with the cookies attached.
    driver.refresh()
    time.sleep(10)
finally:
    # Always release the browser, even if replaying the cookies fails.
    driver.close()

1.7.3 抽屉半自动登录--requests点赞

from selenium import webdriver
import json
import time

#### selenium 半自动登录过程
bro=webdriver.Chrome(executable_path='chromedriver.exe')
bro.implicitly_wait(10)
bro.get('https://dig.chouti.com/')
try:
    sub_btn=bro.find_element_by_id('login_btn')
    print(sub_btn)

    # sub_btn.click()  # 报错
    bro.execute_script('arguments[0].click();',sub_btn)

    # username=bro.find_element_by_css_selector('body > div.login-dialog.dialog.animated2.scaleIn > div > div.login-body > div.form-item.login-item.clearfix.phone-item.mt24 > div.input-item.input-item-short.left.clearfix > input')
    username=bro.find_element_by_css_selector('div.input-item>input.login-phone')
    username.send_keys('18953675221')
    # password=bro.find_element_by_css_selector('body > div.login-dialog.dialog.animated2.scaleIn > div > div.login-footer > div.form-item.login-item.clearfix.mt24 > div')
    password = bro.find_element_by_css_selector('div.input-item>input.pwd-password-input')
    password.send_keys('lqz123')

    time.sleep(3)
    btn=bro.find_element_by_css_selector('body > div.login-dialog.dialog.animated2.scaleIn > div > div.login-footer > div:nth-child(4) > button')

    btn.click()

    input('等')

    with open('chouti.json','w') as f:
        json.dump(bro.get_cookies(),f)

finally:
    bro.close()


#### requests 点赞POST请求过程
import requests

bro=webdriver.Chrome(executable_path='chromedriver.exe')
bro.implicitly_wait(10)
bro.get('https://dig.chouti.com/')

# 把屏幕滑到最底下
bro.execute_script('window.scrollTo(0, document.body.scrollHeight);')
# bro.find_elements_by_css_selector('.link-item')
cookie={}

# 从文件中读出cookie
with open('chouti.json','r') as f:
    res=json.load(f)
for item in res:
    cookie[item['name']]=item['value']

print(cookie) # requests能够使用的cookie


div= bro.find_element_by_class_name('link-con')
time.sleep(2)
header={
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36'
}

div_list=div.find_elements_by_class_name('link-item')
for div in div_list:
    article_id=div.get_attribute('data-id')
    print(article_id)
    # 使用requests发送请求
    res=requests.post('https://dig.chouti.com/link/vote',data={'linkId': article_id},cookies=cookie,headers=header)
    print(res.text)
bro.close()

2 打码平台的使用

# 验证码破解
  -人工
  -自动
    -简单的字母、数字组合---》验证码截图---》图像识别模块(OCR)
    -滑动验证
    -计算验证
    
# 验证码破解平台  无法保证100%成功
  -云打码，超级鹰
    -给它一张图片---》结果返回 （收费的）

# 超级鹰平台

# 本质：就是向该平台发送了一个POST请求 (请求参数、图片、请求头)
    请求参数：user、pass2、softid、codetype、

import requests
from hashlib import md5


class Chaojiying_Client(object):
    """Small HTTP client for the Chaojiying captcha-recognition platform.

    Each request is a form POST that carries the account name, the MD5 hex
    digest of the password and the software id.
    """

    def __init__(self, username, password, soft_id):
        """Store the credentials and build the shared form fields and headers.

        username: account name, sent as the ``user`` field.
        password: plain-text password (str); only its MD5 hex digest is kept.
        soft_id: software id, sent as the ``softid`` field.
        """
        self.username = username
        # Only the hex digest of the UTF-8 encoded password is stored and sent.
        self.password = md5(password.encode('utf8')).hexdigest()
        self.soft_id = soft_id
        # Form fields included in every request.
        self.base_params = dict(
            user=self.username,
            pass2=self.password,
            softid=self.soft_id,
        )
        self.headers = {
            'Connection': 'Keep-Alive',
            'User-Agent': 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0)',
        }

    def PostPic(self, im, codetype):
        """Upload a captcha image and return the decoded JSON response.

        im: image content as bytes.
        codetype: captcha type code, see http://www.chaojiying.com/price.html
        """
        # 'codetype' goes first and the shared fields follow, same as the
        # original dict-then-update construction.
        form_fields = {'codetype': codetype, **self.base_params}
        upload = {'userfile': ('ccc.jpg', im)}
        response = requests.post(
            'http://upload.chaojiying.net/Upload/Processing.php',
            data=form_fields,
            files=upload,
            headers=self.headers,
        )
        return response.json()

    def ReportError(self, im_id):
        """Report a wrongly recognised captcha and return the decoded JSON response.

        im_id: id of the image being reported.
        """
        form_fields = {'id': im_id, **self.base_params}
        response = requests.post(
            'http://upload.chaojiying.net/Upload/ReportError.php',
            data=form_fields,
            headers=self.headers,
        )
        return response.json()


if __name__ == '__main__':
    # Account name, password and software id. Generate your own id under
    # User Center >> Software ID to replace the sample one.
    chaojiying = Chaojiying_Client('306334678', 'lqz12345', '903641')
    # Read the captcha image as bytes; replace a.jpg with your local image
    # path (on Windows the path may need // separators). The context manager
    # closes the file handle as soon as the bytes are read.
    with open('a.jpg', 'rb') as image_file:
        im = image_file.read()
    # 1902 is the captcha type code (see the price list on the official site).
    print(chaojiying.PostPic(im, 1902))

3 xpath选择器语法

# 1 XPath 是一门在XML文档中查找信息的语言

# 2.XPath 使用路径表达式来选取 XML 文档中的节点或者节点集。
  这些路径表达式和我们在常规的电脑文件系统中看到的表达式非常相似。

# 3.记住的语法：
    标签名   选取此节点下的所有标签
    /	    从根节点选取    # 取子标签
    //	    从当前节点选取  # 取子孙标签
    .	    选取当前节点
    ..	    选取当前节点的父节点
    @	    选取属性
    *       所有标签
    
# 4.lxml解析模块提供的xpath

doc='''
<html>
 <head>
  <base href='http://example.com/' />
  <title>Example website</title>
 </head>
 <body>
  <div id='images'>
   <a href='image1.html' name='sss'>Name: My image 1 <br /><img src='image1_thumb.jpg' /></a>
   <a href='image2.html' name='lqz'>Name: My image 2 <br /><img src='image2_thumb.jpg' /></a>
   <a href='image3.html'>Name: My image 3 <br /><img src='image3_thumb.jpg' /></a>
   <a href='image4.html' class='li'>Name: My image 4 <br /><img src='image4_thumb.jpg' /></a>
   <a href='image5.html' class='li li-item' name='items'>Name: My image 5 <br /><img src='image5_thumb.jpg' /></a>
   <a href='image6.html' name='items'><span><h5>test</h5></span>Name: My image 6 <br /><img src='image6_thumb.jpg' /></a>
  </div>
 </body>
</html>
'''
# 以lxml模块为例
from lxml import etree

html=etree.HTML(doc)  # 要解析的字符串（html）
# 如果是文件，使用这个
# html=etree.parse('search.html',etree.HTMLParser())


### 注意1： .xpath()  和 etree.tostring() 
etree对象.xpath(xpath语法)  # 返回一个列表，包含 匹配到的 节点对象

etree.tostring(节点对象[0], encoding='UTF-8').decode('UTF-8')   # 将xpath的元素对象，转成可见的字符串

### 注意2： 节点对象 可继续xpath匹配，但需要 '.' 获取当前节点
节点对象.xpath('./div')  # 选择该节点对象 下的div子节点

节点对象.xpath('/div')   # 没有'.'  '/' 又会从根节点中找div子节点
      # 操作系统中的 文件路径很类似
    

# 1 所有节点
a=html.xpath('//*')
a=html.xpath('/*')

# 2 指定节点（结果为列表）
a=html.xpath('//head')  # 找出所有的head标签

# 3 子节点，子孙节点
a=html.xpath('//div/a')
a=html.xpath('//body/a') # 无数据
a=html.xpath('//body//a')

# 4 父节点
a=html.xpath('//body//a[@href="image1.html"]/..')
a=html.xpath('//body//a[1]/..')  # 第一个a标签的父节点 div
a=html.xpath('//a[1]')  # 第一个a标签

# 也可以这样
a=html.xpath('//body//a[1]/parent::*')

# 5 属性匹配
a=html.xpath('//body//a[@href="image1.html"]')
a=html.xpath('//a[@href="image1.html"]')
a=html.xpath('//base[@href="http://example.com/"]')
a=html.xpath('//*[@href="http://example.com/"]')

# 6 文本获取
a=html.xpath('//body//a[@href="image1.html"]/text()')
a=html.xpath('//a/text()')

# 7 属性获取
a=html.xpath('//body//a/@href')
a=html.xpath('//body//a[1]/@xx')
# 注意从1 开始取（不是从0）
a=html.xpath('//body//a[1]/@href')

# 8 属性多值匹配
#  a 标签有多个class类，直接匹配就不可以了，需要用contains 包含
a=html.xpath('//body//a[@class="li"]')
a=html.xpath('//body//a[contains(@class,"li")]')
a=html.xpath('//body//a[contains(@class,"li")]/text()')

# [starts-with(@attr, "he")] matches elements whose attribute value starts with "he"
a=html.xpath('//div[starts-with(@id, "he")]')   # every div whose id starts with "he"

             
# 9 多属性匹配
a=html.xpath('//body//a[contains(@class,"li") or @name="items"]')   # 或      
a=html.xpath('//body//a[contains(@class,"li")] | //body//a[@name="items"] ')   
             # 注意: "|" 两边必须是完整的xpath路径
             
a=html.xpath('//body//a[contains(@class,"li") and @name="items"]/text()')   # 且             
a=html.xpath('//body//a[contains(@class,"li")]/text()')

# 10 按序选择
a=html.xpath('//a[2]/text()')
a=html.xpath('//a[2]/@href')
a=html.xpath('//a[contains(@class,"vervideo-lilink")]/@href')
# 取最后一个
a=html.xpath('//a[last()]/@href')
# 位置小于3的
a=html.xpath('//a[position()<3]/@href')
# Second to last: last() is the final position, so last()-1 is the one before it
a=html.xpath('//a[last()-1]/@href')

# 11 节点轴选择
# ancestor：祖先节点
# 使用了* 获取所有祖先节点
a=html.xpath('//a/ancestor::*')
# 获取祖先节点中的div
a=html.xpath('//a/ancestor::div')
# attribute：属性值
a=html.xpath('//a[1]/attribute::*')
# child：直接子节点
a=html.xpath('//a[1]/child::*')
a=html.xpath('//a[1]/child::img')
# descendant：所有子孙节点
a=html.xpath('//a[6]/descendant::*')

# following:当前节点之后所有节点
a=html.xpath('//a[1]/following::*')
a=html.xpath('//a[1]/following::*[1]/@href')

# following-sibling:当前节点之后同级节点
a=html.xpath('//a[1]/following-sibling::*')
a=html.xpath('//a[1]/following-sibling::a')
a=html.xpath('//a[1]/following-sibling::*[2]')
a=html.xpath('//a[1]/following-sibling::*[2]/@href')

print(a)

posted @ 2022-06-05 18:09 Edmond辉仔阅读(98) 评论(0) 编辑收藏举报

刷新页面返回顶部

登录后才能查看或发表评论，立即登录或者逛逛博客园首页

相关博文：

· 13--selenium回顾

· 17--Scrapy03:分页、模拟登录与中间件

· selenium的使用

· selenium、xpath、打码平台

· 「爬虫04」selenium

阅读排行：
· TypeScript + Deepseek 打造卜卦网站：技术与玄学的结合
· Manus的开源复刻OpenManus初探
· AI 智能体引爆开源社区「GitHub 热点速览」
· 从HTTP原因短语缺失研究HTTP/2和HTTP/3的设计差异
· 三行代码完成国际化适配，妙~啊~

公告

昵称： Edmond辉仔
园龄： 4年
粉丝： 14
关注： 10

+加关注

2025年3月

日

一

二

三

四

五

六

Edmond辉仔