BeautifulSoup4 库的基本使用
喜欢我的博客可以加关注,有问题可以提问我。
1. 基本使用(html 内容较长,后面的示例不再重复粘贴,均复用本节定义的 html 变量)
# Section 1: parse a sample HTML document and read its <title> text.
from bs4 import BeautifulSoup

# Sample document; the later snippets refer to this same ``html`` variable.
html = """ <html> <head><title>dsojfeoifjosieofiej</title></head> <meta http-equiv="content-type" content="text/html;charset=utf-8"> <meta http-equiv="X-UA-Compatible" content="IE=Edge"> <meta content="always" name="referrer"> <meta name="theme-color" content="#2932e1"> <link rel="shortcut icon" href="/favicon.ico" type="image/x-icon" /> <link rel="search" type="application/opensearchdescription+xml" href="/content-search.xml" title="百度搜索" /> <link rel="icon" sizes="any" mask href="//www.baidu.com/img/baidu_85beaf5496f291521eb75ba38eacbd87.svg"> <link rel="dns-prefetch" href="//s1.bdstatic.com"/> <link rel="dns-prefetch" href="//t11.baidu.com"/> <link rel="dns-prefetch" href="//t12.baidu.com"/> <link rel="dns-prefetch" href="//b1.bdstatic.com"/> """

# Parse with the lxml backend.
soup = BeautifulSoup(html, 'lxml')

# Show the parsed tree re-indented for readability.
pretty_markup = soup.prettify()
print(pretty_markup)

# Show the text inside the <title> tag.
title_tag = soup.title
print(title_tag.string)
2.选择元素
# Section 2: select elements by using the tag name as an attribute.
# ``html`` is the sample document defined in section 1.
from bs4 import BeautifulSoup

soup = BeautifulSoup(html, 'lxml')

print(soup.title)
print(soup.head)
# Attribute-style access returns only the first matching tag.
print(soup.p)
3.获取名称
# Section 3: read the tag name of a selected element.
# ``html`` is the sample document defined in section 1.
from bs4 import BeautifulSoup

soup = BeautifulSoup(html, 'lxml')

first_p = soup.p
print(first_p.name)
4.获取属性
# Section 4: read an attribute of a tag, two equivalent spellings.
# ``html`` is the sample document defined in section 1.
from bs4 import BeautifulSoup

soup = BeautifulSoup(html, 'lxml')

# Via the ``attrs`` mapping.
attribute_map = soup.p.attrs
print(attribute_map['name'])

# Via subscripting the tag directly.
paragraph = soup.p
print(paragraph['name'])
5.获取内容
# Section 5: read the text content of a tag.
# ``html`` is the sample document defined in section 1.
from bs4 import BeautifulSoup

soup = BeautifulSoup(html, 'lxml')

first_p = soup.p
print(first_p.string)
6.嵌套选择
# Section 6: chain tag-name attributes to drill down into nested tags.
# ``html`` is the sample document defined in section 1.
from bs4 import BeautifulSoup

soup = BeautifulSoup(html, 'lxml')

head_tag = soup.head
title_tag = head_tag.title
print(title_tag.string)
7.子节点和子孙节点
# Section 7: direct children versus all descendants of the first <p> tag.
# ``html`` is the sample document defined in section 1.
from bs4 import BeautifulSoup

# One parse is enough; the three examples below all read the same tree.
soup = BeautifulSoup(html, 'lxml')

# Direct children, as a list.
print(soup.p.contents)

# Direct children again, this time through the ``children`` iterator.
print(soup.p.children)
for i, child in enumerate(soup.p.children):
    print(i, child)

# All descendants (children, grandchildren, ...).
# The attribute was previously misspelled "desccendants"; Beautiful Soup
# treats an unknown attribute as a child-tag lookup and returns None, so
# the enumerate() call below could not work.
print(soup.p.descendants)
for i, child in enumerate(soup.p.descendants):
    print(i, child)
8.父节点和祖先节点
# Section 8: the parent and the ancestors of the first <a> tag.
# ``html`` is the sample document defined in section 1.
from bs4 import BeautifulSoup

# Parse once; both examples below read the same tree.
soup = BeautifulSoup(html, 'lxml')

# Immediate parent node.
print(soup.a.parent)

# Every ancestor, paired with its position in the iteration.
print(list(enumerate(soup.a.parents)))
9.兄弟节点
# Section 9: sibling nodes of the first <a> tag.
# ``html`` is the sample document defined in section 1.
from bs4 import BeautifulSoup

soup = BeautifulSoup(html, 'lxml')

# Siblings that come after the tag.
following = list(enumerate(soup.a.next_siblings))
print(following)

# Siblings that come before the tag.
preceding = list(enumerate(soup.a.previous_siblings))
print(preceding)
10.标准选择器
# Section 10: find_all() by tag name, applied at two levels.
# ``html`` is the sample document defined in section 1.
from bs4 import BeautifulSoup

soup = BeautifulSoup(html, 'lxml')

# For every <ul>, print the <li> tags found inside it.
for list_tag in soup.find_all('ul'):
    list_items = list_tag.find_all('li')
    print(list_items)
10.1加参数
# Section 10.1: find_all() filtered by attributes.
# ``html`` is the sample document defined in section 1.
from bs4 import BeautifulSoup

# Parse once; both styles of filtering below query the same tree.
soup = BeautifulSoup(html, 'lxml')

# Filter with an explicit ``attrs`` dictionary.
print(soup.find_all(attrs={'id': 'list-1'}))
print(soup.find_all(attrs={'name': 'elements'}))

# Filter with keyword arguments; ``class_`` carries a trailing underscore
# because ``class`` is a reserved word in Python.
print(soup.find_all(id='list-1'))
# NOTE(review): the CSS selector example uses the class "element"
# ('#list-2 .element'); confirm whether 'elements' is intended here.
print(soup.find_all(class_='elements'))
10.2text
# Section 10.2: find_all() matched against text content.
# ``html`` is the sample document defined in section 1.
from bs4 import BeautifulSoup

soup = BeautifulSoup(html, 'lxml')

# Matching on text returns the text content itself.
matches = soup.find_all(text='Foo')
print(matches)
10.3 find(返回单个元素,即第一个匹配的元素)
# Section 10.3: find() returns a single result instead of a list.
# ``html`` is the sample document defined in section 1.
from bs4 import BeautifulSoup

soup = BeautifulSoup(html, 'lxml')

# First <ul> in the document.
print(soup.find('ul'))

# The type of the object that find() hands back.
result_type = type(soup.find('ul'))
print(result_type)

# A lookup for a tag name that is not expected to exist.
print(soup.find('page'))
10.4 find_parents() find_parent()(这里和上面的类似就不粘贴代码了)
10.5 find_next_siblings() find_next_sibling()(这里和上面的类似就不粘贴代码了)
11. CSS 选择器
# Section 11: CSS selectors via select().
# ``html`` is the sample document defined in section 1.
from bs4 import BeautifulSoup

soup = BeautifulSoup(html, 'lxml')

# Elements with class "panel-heading" inside an element with class "panel".
panel_headings = soup.select('.panel .panel-heading')
print(panel_headings)

# <li> tags inside a <ul> tag.
list_items = soup.select('ul li')
print(list_items)

# Elements with class "element" inside the element whose id is "list-2".
second_list_elements = soup.select('#list-2 .element')
print(second_list_elements)

# Type of a single item from the list that select() returns.
first_ul = soup.select('ul')[0]
print(type(first_ul))
# Section 11 (continued): select() can be called again on each result.
# ``html`` is the sample document defined in section 1.
from bs4 import BeautifulSoup

soup = BeautifulSoup(html, 'lxml')

# For every <ul>, print the <li> tags inside it.
# The inner selector was 'ul', which only matches lists nested inside another
# list; 'li' matches the find_all('ul') / find_all('li') example in section 10.
for ul in soup.select('ul'):
    print(ul.select('li'))
11.1 获取属性
# Section 11.1: read an attribute from each tag returned by select().
# ``html`` is the sample document defined in section 1.
from bs4 import BeautifulSoup

soup = BeautifulSoup(html, 'lxml')

for list_tag in soup.select('ul'):
    # Subscript form.
    print(list_tag['id'])
    # Equivalent lookup through the ``attrs`` mapping.
    print(list_tag.attrs['id'])
11.2 获取内容
# Section 11.2: read the text of each tag returned by select().
# ``html`` is the sample document defined in section 1.
from bs4 import BeautifulSoup

soup = BeautifulSoup(html, 'lxml')

for item in soup.select('li'):
    item_text = item.get_text()
    print(item_text)