python3之BeautifulSoup
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
#
# Beautiful Soup walkthrough: tag selectors, the find/find_all API and CSS
# selectors, all run against one small sample document.
#
# Only the final prettify() call is live. Every other example is kept as a
# commented-out line so it can be enabled one at a time while experimenting.
from bs4 import BeautifulSoup

html = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title" name="dromouse"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1"><!-- Elsie --></a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""

# Parse with the parser bundled in the standard library, so no extra
# parser package (lxml, html5lib) is needed.
soup = BeautifulSoup(html, "html.parser")

# ---------------------------------------------------------------------------
# Tag selectors: attribute-style access returns the FIRST matching tag.
# ---------------------------------------------------------------------------
# print(soup.title)
# print(soup.title.name)
# print(soup.title.text)
# print(soup.head)
# print(soup.p)
# print(soup.a)
#
# Reading an attribute: index the tag directly or go through .attrs.
# print("====获取属性====")
# print(soup.p["name"])
# print(soup.p.attrs["name"])
#
# Reading the text content.
# print("====获取内容====")
# print(soup.p.string)
# print(soup.p.text)
#
# Child nodes: .contents returns the children as a list, so individual
# children can be picked out by index.
# print("====获取子节点====")
# print(soup.p.contents)
# print(soup.p.contents[0])
#
# Parent and ancestors.
# print(soup.p.parent)
# print(soup.p.parent.name)
# content = soup.head.title.string
# for parent in content.parents:
#     print(parent.name)

# ---------------------------------------------------------------------------
# Standard selectors: find_all() returns every match, find() only the first.
# ---------------------------------------------------------------------------
# print(soup.find_all("a")[0])
# print(soup.find_all("a"))
# print(soup.find_all(attrs={'name': 'dromouse'}))
# print(soup.find_all(class_="title"))
# `string=` is the current name of the argument formerly spelled `text=`.
# print(soup.find_all(string="The Dormouse's story"))
# print(soup.find_all("a", limit=1))
# print(soup.find("a"))

# ---------------------------------------------------------------------------
# CSS selectors: select() always returns a list of matching tags.
# ---------------------------------------------------------------------------
# Elements with class "sister" nested inside elements with class "story";
# class names are written with a leading dot.
# print(soup.select(".story .sister"))
# <a> tags nested inside <p> tags.
# print(soup.select("p a"))
# The element whose id is "link2".
# print(soup.select("#link2"))
# The first <p> tag.
# print(soup.select("p")[0])
# The class attribute of the first <p> tag (bs4 returns it as a list).
# print(soup.select("p")[0]["class"])
# Text of the first <p> tag; get_text is a method, so it must be called.
# print(soup.select("p")[0].get_text())
# print(soup.select("p")[0].text)

# Re-serialize the parsed tree with one tag per line and nested indentation.
print(soup.prettify())
注意:
(1)当class属性值中带有空格时(即该元素同时拥有多个class),使用select时需要把空格换成点,用点把各个class名连接起来;
例如:<td class="text text-danger">0</td>,可以分别按照下面方法来处理
print(soup.find_all(class_="text text-danger")[0].text)
print(soup.select(".text.text-danger")[0].text)