Python 解析 HTML
XPath
常用匹配规则:
| 符号 | 描述 |
| --- | --- |
| `/` | 从当前节点选取直接子节点 |
| `//` | 从当前节点选取子孙节点 |
| `.` | 选取当前节点 |
| `..` | 选取当前节点的父节点 |
| `@` | 选取属性 |
属性获取:
from lxml import etree

# Sample markup: a single link whose href attribute we want to read.
html = '<div><a class="du" href="http://www.baidu.com">百度</a></div>'
doc = etree.HTML(html)

# The trailing /@href step selects the attribute of the matched <a>
# rather than the element itself.
hrefs = doc.xpath('//a[@class="du"]/@href')
print(hrefs)
文本获取:
from lxml import etree

# Sample markup: a single link whose text content we want to read.
html = '<div><a class="du" href="http://www.baidu.com">百度</a></div>'
doc = etree.HTML(html)

# The trailing /text() step selects the text inside the matched <a>.
texts = doc.xpath('//a[@class="du"]/text()')
print(texts)
属性多值匹配:
from lxml import etree

# The class attribute holds two values here ("du baidu"), so the query
# below uses contains() on @class instead of an exact comparison.
html = '<div><a class="du baidu" href="http://www.baidu.com">百度</a></div>'
doc = etree.HTML(html)

texts = doc.xpath('//a[contains(@class,"du")]/text()')
print(texts)
多属性匹配:
from lxml import etree

# The link carries both a name and a multi-valued class attribute.
html = '<div><a name="item" class="du baidu" href="http://www.baidu.com">百度</a></div>'
doc = etree.HTML(html)

# Two attribute conditions joined with "and" inside one predicate.
texts = doc.xpath('//a[contains(@class,"du") and @name="item"]/text()')
print(texts)
按序选择:
from lxml import etree

html = """ <li>item1</li> <li>item2</li> <li>item3</li> <li>item4</li> <li>item5</li> """
doc = etree.HTML(html)

# Positional predicates, evaluated and printed in this order.
queries = (
    '//li[1]/text()',             # the first item
    '//li[last()]/text()',        # the last item
    '//li[position()<3]/text()',  # the first and second items
    '//li[last()-2]/text()',      # the third item from the end
)
for query in queries:
    result = doc.xpath(query)
    print(result)
更多用法:http://www.w3school.com.cn/xpath/xpath_functions.asp
Beautiful Soup
节点选择器:
from bs4 import BeautifulSoup

html = """ <div> <li class="d1">item1</li> <li class="d2">item2</li> <li class="d3">item3</li> <li class="d4">item4</li> <li class="d5">item5</li> </div> """
soup = BeautifulSoup(html, 'lxml')

# Node selector: navigate by tag name (soup.div), then take its children.
# The children object itself is printed first, then each child's .string.
children = soup.div.children
print(children)
for child in children:
    print(child.string)
方法选择器:
# Signature: find_all(name, attrs, recursive, string, limit, **kwargs)
# (Beautiful Soup releases before 4.4.0 call the `string` parameter `text`.)
from bs4 import BeautifulSoup

html = """ <div> <li class="d1">item1</li> <li class="d2">item2</li> <li class="d3">item3</li> <li class="d4">item4</li> <li class="d5">item5</li> </div> """
soup = BeautifulSoup(html, 'lxml')

# Method selector in two steps: find every <div>, then look inside each one
# for the <li> whose class is "d3".
divs = soup.find_all(name="div")
for div in divs:
    # Use a separate name for the extracted text so the list being iterated
    # is not rebound inside its own loop.
    item_text = div.find_all(name="li", class_="d3")[0].get_text()  # same text as .string here
    print(item_text)
CSS 选择器:
from bs4 import BeautifulSoup

html = """ <div> <li class="d1">item1</li> <li class="d2">item2</li> <li class="d3">item3</li> <li class="d4" name="d">item4</li> <li class="d5">item5</li> </div> """
soup = BeautifulSoup(html, 'lxml')

# select() takes a CSS selector: <li> under a <div> whose name attribute is "d".
matches = soup.select('div li[name="d"]')
for tag in matches:
    print(type(tag))
    print(tag.get_text())
Pyquery
初始化
字符串初始化:
from pyquery import PyQuery as pq

# Initialise a PyQuery object directly from an HTML string.
html = "<a href='http://www.baidu.com'>百度一下</a>"
doc = pq(html)
URL初始化:
from pyquery import PyQuery as pq

# Initialise from a URL: the document is loaded from the given address,
# so this needs network access.
doc = pq(url="http://www.baidu.com")
print(doc)
文件初始化:
from pyquery import PyQuery as pq

# Initialise from a local file; "demo.html" is resolved relative to the
# current working directory.
doc = pq(filename="demo.html")
print(doc)
查找节点
CSS 选择器:
from pyquery import PyQuery as pq

html = """ <div class="qrcode-text" id="1"> 我是div标签的文本 <p class="title">我是标题<a href="http://www.baidu.com">百度一下</a></p> <p class="content">我是内容</p> </div> """
doc = pq(html)

# Calling the PyQuery object with a CSS selector returns the matching nodes:
# here, the <a> inside .title inside .qrcode-text.
link = doc(".qrcode-text .title a")
print(link)
children() 查找子节点
find() 查找子孙节点
parent() 查找父节点
parents() 查找祖先节点
siblings() 查找兄弟节点
from pyquery import PyQuery as pq

html = """ <body> <div class="qrcode-text" id="1"> 我是div标签的文本 <p class="title">我是标题<a class="du" href="http://www.baidu.com">百度一下</a></p> <p class="content">我是内容 <span class="first">第一行</span> </p> </div> </body> """
doc = pq(html)

# children(): child nodes of the .content paragraph.
print(doc(".content").children())

# find(): descendant nodes matching the selector, searched from the document.
print(doc.find("span"))

span = doc("span")

# parent(): the parent node of the <span>.
print(span.parent())

# parents(): ancestors of the <span>, filtered by the selector "#1".
print(span.parents("#1"))

# siblings(): sibling nodes of the .title paragraph.
print(doc(".title").siblings())
获取信息
获取属性 attr()
内部文本 text()
html文本 html()
from pyquery import PyQuery as pq

html = """ <body> <div class="item_1"><span>1.</span>第一行</div> <div class="item_2"><span>2.</span>第二行</div> <div class="item_3"><span>3.</span>第三行</div> </body> """
doc = pq(html)

# items() yields each matched <div> so the getters can be called per node.
for div in doc("div").items():
    print(div.attr("class"))  # attribute value
    print(div.text())         # inner text
    print(div.html())         # inner HTML
节点操作
对节点进行动态修改。
removeClass()
addClass()
from pyquery import PyQuery as pq

html = """ <body> <div class="item_1"><span>1.</span>第一行</div> <div class="item_2"><span>2.</span>第二行</div> <div class="item_3"><span>3.</span>第三行</div> </body> """
doc = pq(html)

# Swap each <div>'s current class for its 1-based position in the selection.
for index, div in enumerate(doc("div").items(), 1):
    current_class = div.attr("class")
    div.removeClass(current_class)
    div.addClass(str(index))
    print(div)
attr()
text()
from pyquery import PyQuery as pq

html = """ <body> <div class="item_1"><span>1.</span>第一行</div> <div class="item_2"><span>2.</span>第二行</div> <div class="item_3"><span>3.</span>第三行</div> </body> """
doc = pq(html)

# Called with arguments, attr() and text() act as setters: give every <div>
# an id equal to its 1-based position and replace its text.
for index, div in enumerate(doc("div").items(), 1):
    div.attr(id=str(index))
    div.text('Hello World')
    print(div)
remove()
from pyquery import PyQuery as pq

html = """ <body> Hello World! <div class="item_1"><span>1.</span>第一行</div> <div class="item_2"><span>2.</span>第二行</div> <div class="item_3"><span>3.</span>第三行</div> </body> """
doc = pq(html)

# Call remove("div") on the <body> selection, then print the text of
# what that call returns.
body = doc("body")
remaining = body.remove("div")
print(remaining.text())
更多用法:http://pyquery.readthedocs.io/en/latest/api.html