# -*- coding: utf-8 -*-
"""pyquery module walkthrough.

pyquery is a powerful and flexible HTML parsing library.  If regular
expressions feel too tedious to write, BeautifulSoup's syntax feels too hard
to remember, and you already know jQuery's syntax, pyquery is the best choice.

Install pyquery with::

    pip3 install pyquery
"""
from pyquery import PyQuery as pq

__author__ = 'Linhaifeng'

# Sample markup used by every example below.  The leading stray ``</div>`` is
# part of the original snippet and is kept unchanged.
html = '''
</div><div class="account-signin">
    <ul class="navigation menu" aria-label="Social Media Navigation">
        哈哈哈
        <li class="tier-1 last" aria-haspopup="true">
            <a href="/accounts/login/" title="Sign Up or Sign In to Python.org">Sign In</a>
            <ul class="subnav menu">
                <li class="tier-2 element-1" role="treeitem"><a href="/accounts/signup/">Sign Up / Register</a></li>
                <li class="tier-2 element-2" role="treeitem"><a href="/accounts/login/">Sign In</a></li>
            </ul>
        </li>
    </ul>
</div>
'''

# Usage:

# 1 ===========> Initialization
# ===> from a string
# from pyquery import PyQuery as pq
# doc=pq(html)
# print(doc('.tier-2'))  # the argument is a CSS selector by default

# ===> from a URL
# from pyquery import PyQuery as pq
# doc=pq(url='http://www.baidu.com')
# print(doc('head'))

# ===> from a file
# from pyquery import PyQuery as pq
# doc=pq(filename='demo.html')
# print(doc('li'))

# 2 ===========> Basic CSS selectors
doc = pq(html)
# print(doc('.tier-2'))  # the argument is a CSS selector by default

# Finding elements
# Child elements
# print(doc('li').find('li'))  # find() searches all descendants, not only direct children
# print('==>',doc('li').children('li'))  # direct children only

# Parent element
# print(doc('.tier-2').parent())

# Ancestor elements: the parent, the parent's parent, ...
# print(doc('.tier-2').parents())
# print(doc('.tier-2').parents('.account-signin'))  # filter among the ancestors

# Aside: combining selectors (descendant and compound class selectors)
# print(doc('.tier-1 .tier-2'))
# print(doc('.tier-1 .tier-2.element-1'))

# Sibling elements
# print(doc('.tier-2.element-1').siblings())
# print(doc('.tier-2.element-1').siblings('li a'))

# 3 ===========> Iteration
# lis=doc('li').items()
# print(lis)
#
# for i,j in enumerate(lis):
#     print(i,j)

# 4 ===========> Getting attributes
# print(doc('li').attr('class'))
# print(doc('a').attr.href)

# 5 ===========> Getting text
# print(doc('a').text())

# 6 ===========> Getting HTML
# print(doc('.subnav.menu'))
# print(doc('.subnav.menu').html())

# 7 ===========> DOM manipulation
# addClass / removeClass
# tag=doc('.subnav.menu')
# print(tag)
#
# tag.addClass('active')
# print(tag)
#
# tag.removeClass('active')
# print(tag)
#
# tag=doc('.tier-2.element-1 a')
# tag.attr('name','link')
# tag.css('font-size','14px')
# print(tag)

# Work on a copy: remove() edits the underlying tree in place, and that tree
# is shared with ``doc``.  Removing the <li> nodes from the live document
# would leave the pseudo-class examples in section 9 with nothing to match.
tag = doc('.navigation.menu').clone()
# print(tag.text())  # returns all of the text underneath tag
tag.find('li').remove()
# To get only the "哈哈哈" that sits directly under the <ul>, the <li>
# children have to be removed first.
print(tag.text())

# 8 ===========> pyquery API documentation
# http://pyquery.readthedocs.io/en/latest/api.html

# 9 ===========> Pseudo-class selectors
print(doc('li:first-child'))       # <li> elements that are the first child of their parent
print(doc('li:last-child'))        # <li> elements that are the last child of their parent
print(doc('li:nth-child(2)'))      # <li> elements that are the 2nd child of their parent
print(doc('li:gt(2)'))             # <li> elements after index 2
print(doc('li:nth-child(2n)'))     # <li> elements in even child positions
print(doc('li:nth-child(2n+1)'))   # <li> elements in odd child positions
print(doc('li:contains(second)'))  # <li> elements whose text contains "second"

# More CSS selectors are described at
# http://www.w3school.com.cn/css/index.asp
# Official site: http://pyquery.readthedocs.io/