Xpath解析库的使用
# Common XPath rules:
#   nodename   select all child nodes of this node
#   /          select direct children from the current node
#   //         select descendants from the current node
#   .          select the current node
#   ..         select the parent of the current node
#   @          select an attribute

# Sample markup used throughout these examples (note the deliberately
# missing </a> on the third link — lxml will repair it)
text = '''
<ul id="dmr" name="liebiao">
    <li data-closeper="" aria-label="查看更多" role="menuitem" aria-haspopup="true" data-groupid="104" class="J_Cat a-all">
        <a data-cid="1" data-dataid="222878" >家电</a>
        <a data-cid="1" data-dataid="222908" >数码</a>
        <a data-cid="1" data-dataid="222879" >手机
        <i aria-hidden="true" class="tb-ifont service-arrow"></i>
    </li>
</ul>
'''
1. etree示例引入
# Introducing etree
from lxml import etree

# Build an HTML element object from a string (usable for XPath queries)
html = etree.HTML(text)
# Parse an on-disk file ('./text') into an ElementTree instead
html2 = etree.parse('./text', etree.HTMLParser())
# tostring() serializes the tree; lxml repairs broken markup along the way
# (e.g. the missing </a> closing tag in the sample above)
result = etree.tostring(html)
result2 = etree.tostring(html2)
print(html, html2)
print(type(html), type(html2))
'''
Observed output:
<Element html at 0x2b47848> <lxml.etree._ElementTree object at 0x0000000002B47788>
<class 'lxml.etree._Element'> <class 'lxml.etree._ElementTree'>
'''
# Print the repaired HTML
print(result.decode('utf-8'))
print(result2.decode('utf-8'))
2. 提取页面下的所有节点
# Select every node in the document with //*
from lxml import etree

html = etree.HTML(text)
result = html.xpath('//*')
print(len(result))
print(result)
'''
Observed output:
8
[<Element html at 0x2b539c8>, <Element body at 0x2b53948>, <Element ul at 0x2b53a08>, <Element li at 0x2b53a48>, <Element a at 0x2b53a88>, <Element a at 0x2b53b08>, <Element a at 0x2b53b48>, <Element i at 0x2b53b88>]
'''
3. 提取子节点
# Selecting child vs. descendant nodes
from lxml import etree

html = etree.parse('./text', etree.HTMLParser())
# '/' selects only direct <a> children of <li>
result = html.xpath('//li/a')
# '//' selects all <a> descendants of <ul>, however deep
result2 = html.xpath('//ul//a')
print(len(result), len(result2))
print(result, result2)
'''
Observed output:
3 3
[<Element a at 0x2963cc8>, <Element a at 0x2963d08>, <Element a at 0x2963d48>] [<Element a at 0x2963cc8>, <Element a at 0x2963d08>, <Element a at 0x2963d48>]
'''
4. 提取父节点
# Selecting a parent node
from lxml import etree

html = etree.HTML(text)
# From the <li> whose role attribute is "menuitem", step up with '..'
# and read the parent's "name" attribute
result = html.xpath('//li[@role="menuitem"]/../@name')
print(result)
'''
Observed output:
['liebiao']
'''
5. 属性匹配
# Attribute matching with [@attr="value"]
# (relies on `etree` imported by an earlier section)
html = etree.HTML(text)
# Match the <a> node whose data-dataid is 222878
result = html.xpath('//a[@data-dataid="222878"]')
print(result)
'''
Observed output:
[<Element a at 0x2973c48>]
'''
6. 提取文本内容
# Extracting text content with text()
# (relies on `etree` imported by an earlier section)
html = etree.HTML(text)
# Text of the <a> node whose data-dataid is 222878
result = html.xpath('//a[@data-dataid="222878"]/text()')
print(result)
'''
Observed output:
['家电']
'''
7. 属性值获取
# Reading attribute values with @
from lxml import etree

html = etree.HTML(text)
result = html.xpath('//li/@aria-label')
print(result)
'''
Observed output:
['查看更多']
'''
8. 属性多值匹配
# Matching multi-valued attributes (class="J_Cat a-all")
from lxml import etree

html = etree.HTML(text)
# Exact match on one token fails — @class holds "J_Cat" AND "a-all"
result = html.xpath('//li[@class="J_Cat"]')
# Exact match on the full attribute string works
result2 = html.xpath('//li[@class="J_Cat a-all"]//text()')
# contains() matches a single token among several
result3 = html.xpath('//li[contains(@class, "J_Cat")]//text()')
print(result, result2, result3)
'''
Observed output:
[]
['\n', '家电', '\n', '数码', '\n', '手机\n\n', '\ue62e', '\n']
['\n', '家电', '\n', '数码', '\n', '手机\n\n', '\ue62e', '\n']
'''
9. 多属性匹配
# Matching on several attributes at once.
# XPath operators:
#   or / and            boolean logic
#   mod                 remainder
#   |                   node-set union
#   + - *               arithmetic
#   = != < <= > >=      comparison
from lxml import etree

html = etree.HTML(text)
result = html.xpath('//li[contains(@class, "J_Cat") and @role="menuitem"]/a/text()')
print(result)
'''
Observed output:
['家电', '数码', '手机\n\n', '\n']
'''
10. 按序选择,通过索引的方式进行选择
# Ordinal selection via positional predicates
from lxml import etree

html = etree.HTML(text)
# Text of the first <a> under <li>
print(html.xpath('//li/a[1]/text()'))
# Text of the last <a> under <li>
print(html.xpath('//li/a[last()]/text()'))
# Text of the <a> nodes at position < 3
print(html.xpath('//li/a[position()<3]/text()'))
# Text of the second-to-last <a> under <li>
print(html.xpath('//li/a[last()-1]/text()'))
'''
Recorded output (NOTE(review): only three lines were recorded for four
print calls — the first result appears to be missing from the notes):
['手机\n\n', '\n']
['家电', '数码']
['数码']
'''
11. 节点轴选择
# Node-axis selection:
#   ancestor::            all ancestors of the node
#   attribute::           all attribute values of the node
#   child::               all direct children of the node
#   descendant::          all descendants of the node
#   following::           all nodes after the node in document order
#   following-sibling::   all following siblings of the node
from lxml import etree

html = etree.HTML(text)
print(html.xpath('//li/a[1]/ancestor::*'))
print(html.xpath('//li/a[1]/ancestor::ul'))
print(html.xpath('//li/a[1]/attribute::*'))
print(html.xpath('//li[1]/child::*'))
print(html.xpath('//ul[1]/descendant::a'))
print(html.xpath('//a[1]/following::*'))
print(html.xpath('//a[1]/following-sibling::*'))
'''
Observed output:
[<Element html at 0x2b53b88>, <Element body at 0x2b53b48>, <Element ul at 0x2b53d88>, <Element li at 0x2b53bc8>]
[<Element ul at 0x2b53b48>]
['1', '222878']
[<Element a at 0x2b53b48>, <Element a at 0x2b53d88>, <Element a at 0x2b53bc8>]
[<Element a at 0x2b53b48>, <Element a at 0x2b53d88>, <Element a at 0x2b53bc8>]
[<Element a at 0x2b53b48>, <Element a at 0x2b53d88>, <Element i at 0x2b53bc8>]
[<Element a at 0x2b53d88>, <Element a at 0x2b53bc8>]
'''
12. 用Xpath解析爬取豆瓣top250
# Scrape douban movie top250 with XPath
from lxml import etree
import requests
import json


def get_page(url):
    """Download one page and return its HTML.

    :param url: page URL to fetch
    :return: response body text; calls exit() on a non-200 status
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36'
    }
    response = requests.get(url, headers=headers)
    if response.status_code == 200:
        print('get page success...')
        return response.text
    else:
        exit('get page fail...')


def parse_page(text):
    """Parse one douban top250 page and yield a dict per movie.

    :param text: page HTML source (fixed: docstring previously documented
        a non-existent ``html`` parameter)
    :return: generator of movie-info dicts
    """
    html = etree.HTML(text)
    items = html.xpath('//ol[@class="grid_view"]/li/div[@class="item"]')
    for item in items:
        # Hoist the repeated lookups: the original re-ran the same
        # '//p[@class=""]/text()' query up to six times per item.
        info = item.xpath('.//div[@class="bd"]/p[@class=""]/text()')
        credits = info[0].split()   # director / actor line
        details = info[1].split()   # year / nation / genres line
        quote = item.xpath('.//div[@class="bd"]/p[@class="quote"]/span/text()')
        yield {
            'img': item.xpath('.//div[@class="pic"]//img/@src')[0],
            'details': item.xpath('.//div[@class="hd"]/a/@href')[0],
            'name': item.xpath('.//div[@class="hd"]//span[1]/text()')[0],
            'director': credits[1],
            # Not every entry lists an actor after the director
            'actor': credits[5] if len(credits) > 5 else 'None',
            'time': details[0],
            'nation': details[2],
            'type': details[4:],
            'score': item.xpath('.//div[@class="bd"]/div/span[@class="rating_num"]/text()')[0],
            # Some movies have no quote line
            'introduction': quote if quote else 'None',
        }
    # NOTE: removed the original trailing ``return items`` — a return value
    # inside a generator is dead/misleading code.


def save_to_file(data):
    """Append one movie record to the output file as a JSON line.

    :param data: movie-info dict produced by parse_page
    """
    with open('豆瓣电影top250.txt', 'a', encoding='utf-8') as f:
        f.write(json.dumps(data, ensure_ascii=False) + '\n')


def main(start):
    """Scrape one 25-movie page at offset *start* and persist every record."""
    url = 'https://movie.douban.com/top250?start=' + str(start)
    text = get_page(url)
    data = parse_page(text)
    for item in data:
        print(item)
        save_to_file(item)


if __name__ == '__main__':
    # 10 pages x 25 movies = the full top 250
    for i in range(10):
        start = i * 25
        main(start)
静静的学习一阵子儿...