# Python 爬虫 03 - BeautifulSoup4 使用 (Web scraping, part 3: using BeautifulSoup4)
# Beautiful Soup is a Python library for pulling data out of HTML and XML
# documents.  This demo script follows the official quick-start guide:
# navigating the tree, searching with find()/find_all(), and extracting text.
import re

from bs4 import BeautifulSoup

# The "three sisters" document used throughout the official documentation.
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>

<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>

<p class="story">...</p>
"""

# Build a BeautifulSoup object; prettify() renders the parse tree with
# standard indentation.
soup = BeautifulSoup(html_doc, 'html.parser')
print(soup.prettify())

# --- Navigating the tree --------------------------------------------------
print("soup.title:", soup.title)            # <title>The Dormouse's story</title>
print("soup.title.name:", soup.title.name)  # 'title'
print(soup.title.string)                    # "The Dormouse's story"
print(soup.title.parent.name)               # 'head'
print(soup.p)                               # first <p>: <p class="title"><b>...</b></p>
print(soup.p['class'])                      # ['title'] (class is multi-valued)
print(soup.a)                               # first <a> tag (id="link1")
print(soup.find_all('a'))                   # every <a> tag in the document
print(soup.find(id="link3"))                # <a ... id="link3">Tillie</a>

# Extract the href attribute of every <a> tag in the document.
for link in soup.find_all('a'):
    print(link.get('href'))
# http://example.com/elsie
# http://example.com/lacie
# http://example.com/tillie

# All of the text in the document, with markup stripped.
print(soup.get_text())

# --- Searching with find_all() --------------------------------------------
print(soup.find_all('b'))                   # every <b> tag

# A compiled regex is matched against tag names: tags starting with "b" ...
for tag in soup.find_all(re.compile("^b")):
    print(tag.name)                         # body, b

# ... and tags whose name contains the letter "t".
for tag in soup.find_all(re.compile("t")):
    print(tag.name)                         # html, title

# A list matches any of its elements: all <a> and <b> tags.
print(soup.find_all(["a", "b"]))


def has_class_but_no_id(tag):
    """Return True for tags that have a class attribute but no id."""
    return tag.has_attr('class') and not tag.has_attr('id')


# A function passed to find_all() is called on every tag as a filter.
print(soup.find_all(has_class_but_no_id))

soup.find_all("title")                    # [<title>The Dormouse's story</title>]
soup.find_all("p", "title")               # [<p class="title"><b>...</b></p>]
soup.find_all(id='link2')                 # [<a ... id="link2">Lacie</a>]
soup.find_all(href=re.compile("elsie"))   # [<a ... id="link1">Elsie</a>]
soup.find_all(id=True)                    # every tag with an id, whatever its value
soup.find_all(href=re.compile("elsie"), id='link1')
# [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>]

# HTML5 data-* attributes cannot be used as keyword arguments; pass them via
# attrs instead.  An explicit parser avoids bs4's GuessedAtParserWarning.
data_soup = BeautifulSoup('<div data-foo="value">foo!</div>', 'html.parser')
data_soup.find_all(attrs={"data-foo": "value"})
# [<div data-foo="value">foo!</div>]

# Search by CSS class with class_ ("class" is a reserved word in Python).
soup.find_all("a", class_="sister")
soup.find_all(class_=re.compile("itl"))


def has_six_characters(css_class):
    """Return True for class strings that are exactly six characters long."""
    return css_class is not None and len(css_class) == 6


soup.find_all(class_=has_six_characters)

# A multi-valued class attribute matches on any single value ...
css_soup = BeautifulSoup('<p class="body strikeout"></p>', 'html.parser')
css_soup.find_all("p", class_="strikeout")  # [<p class="body strikeout"></p>]
css_soup.find_all("p", class_="body")       # [<p class="body strikeout"></p>]
# ... or on the exact full attribute string.
css_soup.find_all("p", class_="body strikeout")

# The text argument searches the document's strings rather than its tags.
soup.find_all(text="Elsie")                       # ['Elsie']
soup.find_all(text=["Tillie", "Elsie", "Lacie"])  # ['Elsie', 'Lacie', 'Tillie']
soup.find_all(text=re.compile("Dormouse"))
# ["The Dormouse's story", "The Dormouse's story"]


def is_the_only_string_within_a_tag(s):
    """Return True if this string is the only child of its parent tag."""
    return s == s.parent.string


soup.find_all(text=is_the_only_string_within_a_tag)
# ["The Dormouse's story", "The Dormouse's story", 'Elsie', 'Lacie', 'Tillie', '...']

# Combine a tag filter with a text filter.
soup.find_all("a", text="Elsie")
# [<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>]

# --- get_text() -----------------------------------------------------------
markup = '<a href="http://example.com/">I linked to <i>example.com</i></a>'
soup = BeautifulSoup(markup, 'html.parser')
print(soup.get_text())                  # I linked to example.com
print(soup.i.get_text())                # example.com
print(soup.get_text("|"))               # join the strings with a separator
print(soup.get_text("|", strip=True))   # ... stripping whitespace first