[b0023] python 归纳 (九)_html解析-lxml
# -*- coding: utf-8 -*- """ 学习lxml解析网页 程序功能: 解析 360影视 电影排行榜中的信息 https://www.360kan.com/dianying/index.html lxml简述: 一次从 文件或者字符串读取 html获xml内容 当做一整颗 element树, 子节点也由 element组成,有前、后、父、子元素的概念 用到xpath 用法: 1 导入轮子 from lxml import etree 2 创建root节点元素树 doc_Element_obj = etree.HTML(content_str) 3 chrome浏览器获得网页上想要数据的元素xpath特征 4 方法xpath 返回子元素列表, 方法getnext,getprevious,getparent 获取下一个、前一个、父 元素 5 元素自身信息获取 element.tag,element.attrib,element.text 元素名,属性字典,元素内容 备注: xpath 语法简述: . 当前 .. 父 / 跟路径 // 相对路径 element[@attrib='value'] 属性 参考: Python解析html的几种操作方式 https://blog.csdn.net/u010154424/article/details/52273868 """ import urllib2 import StringIO import gzip from lxml import etree def lxml_parser2(content_str): """ 学习lxml使用 解析网页内容,获取电影排行榜的信息 :param content_str: html内容 :return: 无 """ # 返回root节点element对象 doc_Element_obj = etree.HTML(content_str) # 其它相似方法 etree.XML etree.fromstring etree.parse('filename.xml') # 使用 xpath 获得元素列表,这里只有一个 movierank_Element_list = doc_Element_obj.xpath('//div[@class="m-rank g-wrap p-mod js-rank"]') e = movierank_Element_list[0] # 查看元素内容 # print etree.tostring(movierank_Element_list[0],encoding="utf8") # 获取元素属性值,没有None # attribute_value_str = e.get("class") ### 获取子元素 # 继续使用 xpath # a 查找 # u_o = e.find(".//li[@class='w-newfigure']") # 返回一个 # u_o = e.findall(".//li") # 列表 # print [ b.attrib for b in e.iterfind(".//li") ] # 可迭代对象 # b 遍历 # for u in e.getiterator(): # 把元素当做一颗树,遍历所有结点 # print u.text # for u in e.getiterator("li"): # 把元素当做一颗树,遍历所有结点,包括孙子节点以下 # print u.tag,u.attrib['title'] # c 遍历 # for element in e.iter("*"): # print element.tag,element.attrib,element.text # 遍历元素下所有的li,包括孙子节点 # for element in e.iter("li"): # print element.tag,element.attrib,element.text ### 获取周围元素 前、后、父 # print e.getnext().tag,e.getnext().attrib,e.getnext().text # print e.getprevious().tag, e.getprevious().attrib, e.getprevious().text # print e.getparent().tag, e.getparent().attrib, e.getparent().text def lxml_parser(page): """ 解析网页内容,获取电影排行榜的信息 :param page: html内容 :return: 无 """ 
doc_Element_obj = etree.HTML(page) # 获得电影排行榜元素 借助chrome浏览器 movierank_Element_list = doc_Element_obj.xpath('//div[@class="m-rank g-wrap p-mod js-rank"]') e = movierank_Element_list[0] u_Element_list = e.xpath(".//ul") for u in u_Element_list: li_Element_list = u.getchildren() for li in li_Element_list: print li.attrib['title'] # 获取属性 <li title='xxxx'> print li.xpath('.//span[@class="w-newfigure-hint"]')[0].text print li.xpath('.//div[@class="w-newfigure-imglink g-playicon js-playicon"]/span')[1].text print li.xpath('.//div[@class="w-newfigure-imglink g-playicon js-playicon"]/img')[0].attrib['src'] print li.xpath('.//span[@class="s1"]')[0].text print li.xpath('.//span[@class="s2"]')[0].text print li.xpath('.//p[@class="w-newfigure-desc"]')[0].text def get_html(url, paraser): """ 爬虫获取在线web html,然后解析 :param url: :param paraser: :return: """ headers = { 'Accept': '*/*', 'Accept-Encoding': 'gzip, deflate, sdch', 'Accept-Language': 'zh-CN,zh;q=0.8', 'Host': 'www.360kan.com', 'Proxy-Connection': 'keep-alive', 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36' } request = urllib2.Request(url, headers=headers) response = urllib2.urlopen(request) response.encoding = 'utf-8' # 新增属性 if response.code == 200: data_obj = StringIO.StringIO(response.read()) # response 返回的显示乱码 gzipper = gzip.GzipFile(fileobj=data_obj) data = gzipper.read() # 得到可读的html文档 value = paraser(data) return value else: pass def get_html_fromfile(filepath, paraser): """ 从文件获取html, 然后解析 :param filepath: html文件路径 :param paraser: 解析函数 :return: """ import codecs f_o = codecs.open(filepath, encoding='UTF-8') data = f_o.read() value = paraser(data) return value # 从本地文件获取html value = get_html_fromfile("360_online.html", paraser=lxml_parser) # 从网络获取html # value = get_html('https://www.360kan.com/dianying/index.html', paraser=lxml_parser2)
输入:
360 影视 html https://www.360kan.com/dianying/index.html
被解析的 html 代码(电影排行榜部分):
<div class="m-rank g-wrap p-mod js-rank" data-block="tj-rank_list" monitor-desc="电影排行榜"> <div class="p-mod-title g-clear"> <span class="p-mod-label">电影排行榜</span> <a href="javascript:void(0);" class="p-mod-tab js-tab on">经典榜</a> <a href="javascript:void(0);" class="p-mod-tab js-tab ">热播榜</a> <a href="javascript:void(0);" class="p-mod-tab js-tab ">全球榜</a> </div> <div class="content"> <ul class="w-newfigure-list g-clear js-view on"> <li title="审死官" class="w-newfigure"><a href="https://www.360kan.com/m/fariaEktdHX5SB.html" class="js-link"><div class="w-newfigure-imglink g-playicon js-playicon"> <img src="https://p.ssl.qhimg.com/t0149cf9d603d253723.jpg" data-src="https://p.ssl.qhimg.com/t0149cf9d603d253723.jpg" alt="审死官"><span class="w-newfigure-hint">1992</span><span class="w-newfigure-rank top3">1</span></div><div class="w-newfigure-detail"><p class="title g-clear"><span class="s1">审死官</span><span class="s2">7.5</span></p><p class="w-newfigure-desc">古代大律师扮死人判案</p></div></a></li><li title="旺角卡门" class="w-newfigure"><a href="https://www.360kan.com/m/gqrlaEMmd0r2SB.html" class="js-link"><div class="w-newfigure-imglink g-playicon js-playicon"> <img src="https://p.ssl.qhimg.com/t01ab41149a4385d335.jpg" data-src="https://p.ssl.qhimg.com/t01ab41149a4385d335.jpg" alt="旺角卡门"><span class="w-newfigure-hint">1988</span><span class="w-newfigure-rank top3">2</span></div><div class="w-newfigure-detail"><p class="title g-clear"><span class="s1">旺角卡门</span><span class="s2">7.6</span></p><p class="w-newfigure-desc">华仔不为人知的私密情史</p></div></a></li><li title="警察故事1" class="w-newfigure"><a href="https://www.360kan.com/m/f6TkZUUrQnP1TR.html" class="js-link"><div class="w-newfigure-imglink g-playicon js-playicon"> <img src="https://p.ssl.qhimg.com/t018acd81e6dc47d04e.jpg" data-src="https://p.ssl.qhimg.com/t018acd81e6dc47d04e.jpg" alt="警察故事1"><span class="w-newfigure-hint">1985</span><span class="w-newfigure-rank top3">3</span></div><div class="w-newfigure-detail"><p class="title 
g-clear"><span class="s1">警察故事1</span><span class="s2">7.7</span></p><p class="w-newfigure-desc">正义警察被歹徒诬陷遭通缉</p></div></a></li><li title="东方三侠" class="w-newfigure"><a href="https://www.360kan.com/m/fqPnaUoqQHH0UB.html" class="js-link"><div class="w-newfigure-imglink g-playicon js-playicon"> <img src="https://p.ssl.qhimg.com/t01cbac090d05c866d4.jpg" data-src="https://p.ssl.qhimg.com/t01cbac090d05c866d4.jpg" alt="东方三侠"><span class="w-newfigure-hint">1993</span><span class="w-newfigure-rank ">4</span></div><div class="w-newfigure-detail"><p class="title g-clear"><span class="s1">东方三侠</span><span class="s2">7.3</span></p><p class="w-newfigure-desc">三姐妹大战猥琐老太监</p></div></a></li><li title="武状元苏乞儿" class="w-newfigure"><a href="https://www.360kan.com/m/f6rpZkotQHHASh.html" class="js-link"><div class="w-newfigure-imglink g-playicon js-playicon"> <img src="https://p.ssl.qhimg.com/t01667b6078fb09976c.jpg" data-src="https://p.ssl.qhimg.com/t01667b6078fb09976c.jpg" alt="武状元苏乞儿"><span class="w-newfigure-hint">1992</span><span class="w-newfigure-rank ">5</span></div><div class="w-newfigure-detail"><p class="title g-clear"><span class="s1">武状元苏乞儿</span><span class="s2">7.5</span></p><p class="w-newfigure-desc">星爷练成丐帮绝世武功</p></div></a></li><li title="白发魔女传" class="w-newfigure"><a href="https://www.360kan.com/m/faPnZ0Mme3n7UB.html" class="js-link"><div class="w-newfigure-imglink g-playicon js-playicon"> <img src="https://p.ssl.qhimg.com/t0161723bf1ce91597d.jpg" data-src="https://p.ssl.qhimg.com/t0161723bf1ce91597d.jpg" alt="白发魔女传"><span class="w-newfigure-hint">1993</span><span class="w-newfigure-rank ">6</span></div><div class="w-newfigure-detail"><p class="title g-clear"><span class="s1">白发魔女传</span><span class="s2">7.7</span></p><p class="w-newfigure-desc">魔女勇敢追求自己的幸福</p></div></a></li> </ul> <ul class="w-newfigure-list g-clear js-view "> <li title="英伦对决" class="w-newfigure"><a href="https://www.360kan.com/m/gaXjaRH4QHr1TB.html" class="js-link"><div class="w-newfigure-imglink 
g-playicon js-playicon"> <img src="https://p0.ssl.qhimg.com/d/_hao360/default.png" data-src="https://p.ssl.qhimg.com/t01d6321d62cb9d3bc7.jpg" alt="英伦对决"><span class="w-newfigure-hint">2017</span><span class="w-newfigure-rank top3">1</span></div><div class="w-newfigure-detail"><p class="title g-clear"><span class="s1">英伦对决</span><span class="s2">7.1</span></p><p class="w-newfigure-desc">成龙徒手造炸弹</p></div></a></li><li title="辣妹甜心" class="w-newfigure"><a href="https://www.360kan.com/m/gqXmZRH5QHX5Th.html" class="js-link"><div class="w-newfigure-imglink g-playicon js-playicon"> <img src="https://p0.ssl.qhimg.com/d/_hao360/default.png" data-src="https://p.ssl.qhimg.com/t0151eb7e89e5b209e9.jpg" alt="辣妹甜心"><span class="w-newfigure-hint">2018</span><span class="w-newfigure-rank top3">2</span></div><div class="w-newfigure-detail"><p class="title g-clear"><span class="s1">辣妹甜心</span><span class="s2">6.0</span></p><p class="w-newfigure-desc">塑料姐妹花爆笑换身</p></div></a></li><li title="英雄本色2018" class="w-newfigure"><a href="https://www.360kan.com/m/hKbiYRH4RHfASR.html" class="js-link"><div class="w-newfigure-imglink g-playicon js-playicon"> <img src="https://p0.ssl.qhimg.com/d/_hao360/default.png" data-src="https://p.ssl.qhimg.com/t01f25dd05ae288fcc3.jpg" alt="英雄本色2018"><span class="w-newfigure-hint">2018</span><span class="w-newfigure-rank top3">3</span></div><div class="w-newfigure-detail"><p class="title g-clear"><span class="s1">英雄本色2018</span><span class="s2">4.7</span></p><p class="w-newfigure-desc">江湖大哥码头火拼</p></div></a></li><li title="机器之血" class="w-newfigure"><a href="https://www.360kan.com/m/hKXlZRH4QXf6UR.html" class="js-link"><div class="w-newfigure-imglink g-playicon js-playicon"> <img src="https://p0.ssl.qhimg.com/d/_hao360/default.png" data-src="https://p.ssl.qhimg.com/t01ba1b1e43ef11b171.jpg" alt="机器之血"><span class="w-newfigure-hint">2017</span><span class="w-newfigure-rank ">4</span></div><div class="w-newfigure-detail"><p class="title g-clear"><span 
class="s1">机器之血</span><span class="s2">4.7</span></p><p class="w-newfigure-desc">大哥血性搏杀</p></div></a></li><li title="灵魂摆渡黄泉" class="w-newfigure"><a href="https://www.360kan.com/m/hqXqYhH4Rnb2SR.html" class="js-link"><div class="w-newfigure-imglink g-playicon js-playicon"> <img src="https://p0.ssl.qhimg.com/d/_hao360/default.png" data-src="https://p.ssl.qhimg.com/t01a02b0e9ac491628b.jpg" alt="灵魂摆渡黄泉"><span class="w-newfigure-hint">2018</span><span class="w-newfigure-rank ">5</span></div><div class="w-newfigure-detail"><p class="title g-clear"><span class="s1">灵魂摆渡黄泉</span><span class="s2">7.1</span></p><p class="w-newfigure-desc">少女误入孟婆庄</p></div></a></li><li title="绝色霸王花" class="w-newfigure"><a href="https://www.360kan.com/m/hqflahH5QHH1TR.html" class="js-link"><div class="w-newfigure-imglink g-playicon js-playicon"> <img src="https://p0.ssl.qhimg.com/d/_hao360/default.png" data-src="https://p.ssl.qhimg.com/t017cc748788eb1dd9e.jpg" alt="绝色霸王花"><span class="w-newfigure-hint">2018</span><span class="w-newfigure-rank ">6</span></div><div class="w-newfigure-detail"><p class="title g-clear"><span class="s1">绝色霸王花</span><span class="s2">6.3</span></p><p class="w-newfigure-desc">美女警花解救人质</p></div></a></li> </ul> <ul class="w-newfigure-list g-clear js-view "> <li title="天才枪手" class="w-newfigure"><a href="https://www.360kan.com/m/hKvrZRH4RHH6Sx.html" class="js-link"><div class="w-newfigure-imglink g-playicon js-playicon"> <img src="https://p0.ssl.qhimg.com/d/_hao360/default.png" data-src="https://p.ssl.qhimg.com/t0164b504446f495860.jpg" alt="天才枪手"><span class="w-newfigure-hint">2017</span><span class="w-newfigure-rank top3">1</span></div><div class="w-newfigure-detail"><p class="title g-clear"><span class="s1">天才枪手</span><span class="s2">8.1</span></p><p class="w-newfigure-desc">学霸为钱突破道德底线</p></div></a></li><li title="寂静之地" class="w-newfigure"><a href="https://www.360kan.com/m/gafqZRH4SHT0SR.html" class="js-link"><div class="w-newfigure-imglink g-playicon js-playicon"> <img 
src="https://p0.ssl.qhimg.com/d/_hao360/default.png" data-src="https://p.ssl.qhimg.com/t016d389e50ad5d462d.jpg" alt="寂静之地"><span class="w-newfigure-hint">2018</span><span class="w-newfigure-rank top3">2</span></div><div class="w-newfigure-detail"><p class="title g-clear"><span class="s1">寂静之地</span><span class="s2">6.3</span></p><p class="w-newfigure-desc">发出声音就得死</p></div></a></li><li title="碟中谍5:神秘国度" class="w-newfigure"><a href="https://www.360kan.com/m/g6ToYxH2Q0H5TR.html" class="js-link"><div class="w-newfigure-imglink g-playicon js-playicon"> <img src="https://p0.ssl.qhimg.com/d/_hao360/default.png" data-src="https://p.ssl.qhimg.com/t0155ff14e2610b8969.jpg" alt="碟中谍5:神秘国度"><span class="w-newfigure-hint">2015</span><span class="w-newfigure-rank top3">3</span></div><div class="w-newfigure-detail"><p class="title g-clear"><span class="s1">碟中谍5:神秘国度</span><span class="s2">7.7</span></p><p class="w-newfigure-desc">阿汤哥对战神秘组织</p></div></a></li><li title="机械师2:复活" class="w-newfigure"><a href="https://www.360kan.com/m/fqPlahH3RXP5UB.html" class="js-link"><div class="w-newfigure-imglink g-playicon js-playicon"> <img src="https://p0.ssl.qhimg.com/d/_hao360/default.png" data-src="https://p.ssl.qhimg.com/t01b0c42db6761cc415.jpg" alt="机械师2:复活"><span class="w-newfigure-hint">2016</span><span class="w-newfigure-rank ">4</span></div><div class="w-newfigure-detail"><p class="title g-clear"><span class="s1">机械师2:复活</span><span class="s2">5.6</span></p><p class="w-newfigure-desc">斯坦森暴力解救女友</p></div></a></li><li title="摔跤吧!爸爸" class="w-newfigure"><a href="https://www.360kan.com/m/gKriYhH4QXH3Tx.html" class="js-link"><div class="w-newfigure-imglink g-playicon js-playicon"> <img src="https://p0.ssl.qhimg.com/d/_hao360/default.png" data-src="https://p.ssl.qhimg.com/t01e50eec3e07bcbd3d.jpg" alt="摔跤吧!爸爸"><span class="w-newfigure-hint">2017</span><span class="w-newfigure-rank ">5</span></div><div class="w-newfigure-detail"><p class="title g-clear"><span class="s1">摔跤吧!爸爸</span><span 
class="s2">9.1</span></p><p class="w-newfigure-desc">米叔变身狼爸</p></div></a></li><li title="金蝉脱壳2" class="w-newfigure"><a href="https://www.360kan.com/m/habiYhH4SHr7UR.html" class="js-link"><div class="w-newfigure-imglink g-playicon js-playicon"> <img src="https://p0.ssl.qhimg.com/d/_hao360/default.png" data-src="https://p.ssl.qhimg.com/t01c1267d1730a33a54.jpg" alt="金蝉脱壳2"><span class="w-newfigure-hint">2018</span><span class="w-newfigure-rank ">6</span></div><div class="w-newfigure-detail"><p class="title g-clear"><span class="s1">金蝉脱壳2</span><span class="s2">3.1</span></p><p class="w-newfigure-desc">硬汉集结一起越狱</p></div></a></li> </ul> </div> </div>
输出 :
审死官 1992 1 ./360电影频道-更新更全更受欢迎的影视网站-在线观看_files/t0149cf9d603d253723.jpg 审死官 7.5 古代大律师扮死人判案 旺角卡门 1988 2 ./360电影频道-更新更全更受欢迎的影视网站-在线观看_files/t01ab41149a4385d335.jpg 旺角卡门 7.6 华仔不为人知的私密情史 警察故事1 1985 3 ./360电影频道-更新更全更受欢迎的影视网站-在线观看_files/t018acd81e6dc47d04e.jpg 警察故事1 7.7 正义警察被歹徒诬陷遭通缉 东方三侠 1993 4
参考资料:
lxml.etree._Element 属性:
class:<type 'lxml.etree._Element'> <Element div at 0x31a4608> method: ['__copy__' '__deepcopy__' '__format__' '__new__' '__reduce__' '__reduce_ex__' '__reversed__' '__sizeof__' '__subclasshook__' '_init' 'addnext' 'addprevious' 'append' 'clear' 'extend' 'find' 'findall' 'findtext' 'get' 'getchildren' 'getiterator' 'getnext' 'getparent' 'getprevious' 'getroottree' 'index' 'insert' 'items' 'iter' 'iterancestors' 'iterchildren' 'iterdescendants' 'iterfind' 'itersiblings' 'itertext' 'keys' 'makeelement' 'remove' 'replace' 'set' 'values' 'xpath'] import method: [] method-wrapper: ['__contains__' '__delattr__' '__delitem__' '__getattribute__' '__getitem__' '__hash__' '__init__' '__iter__' '__len__' '__nonzero__' '__repr__' '__setattr__' '__setitem__' '__str__'] module: [] class: ['__class__'] classobj: [] import class: [] attribute: ['__doc__' 'attrib' 'base' 'nsmap' 'prefix' 'sourceline' 'tag' 'tail' 'text'] attrib type: ['__doc__:str' 'attrib:lxml.etree._Attrib' 'base:NoneType' 'nsmap:dict' 'prefix:NoneType' 'sourceline:int' 'tag:str' 'tail:str' 'text:str']
写满200篇博文再说