python3之BeautifulSoup

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
from bs4 import BeautifulSoup

# Sample HTML document that the script parses with BeautifulSoup.
# It contains a <title>, one <p class="title"> and two <p class="story">
# paragraphs, the first of which holds three <a class="sister"> links
# (id link1..link3). Note that <body> and <html> are never closed in this
# sample, and the first link's only child is an HTML comment.
html = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title" name="dromouse"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1"><!-- Elsie --></a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""

# Tag selectors: navigate the parsed tree with attribute-style access.

soup = BeautifulSoup(html, "html.parser")


def demo_tag_selectors(soup):
    """Print examples of attribute-style tag navigation on *soup*.

    These examples were previously disabled by wrapping them in a bare
    triple-quoted string. They now live in a function that is not called
    at import time, so the script's output is unchanged; call
    ``demo_tag_selectors(soup)`` to run them.

    Args:
        soup: A parsed ``BeautifulSoup`` document containing at least a
            ``<head><title>``, a ``<p>`` with a ``name`` attribute and
            an ``<a>`` tag.
    """
    # `soup.<tag>` yields the first matching tag in the document.
    print(soup.title)
    print(soup.title.name)
    print(soup.title.text)
    print(soup.head)
    print(soup.p)
    print(soup.a)

    # Read an attribute, either by subscripting the tag or via `.attrs`.
    print("====获取属性====")
    print(soup.p["name"])
    print(soup.p.attrs["name"])

    # Read the text content of a tag.
    print("====获取内容====")
    print(soup.p.string)
    print(soup.p.text)

    # `.contents` exposes the direct children as a list, so individual
    # children can be picked by index.
    print("====获取子节点====")
    print(soup.p.contents)
    print(soup.p.contents[0])

    # `.parent` is the directly enclosing tag.
    print(soup.p.parent)
    print(soup.p.parent.name)

    # `.parents` walks every ancestor from the node up to the document.
    title_text = soup.head.title.string
    for ancestor in title_text.parents:
        print(ancestor.name)

# Standard selectors: find_all() / find().
def demo_standard_selectors(soup):
    """Print examples of ``find_all()`` / ``find()`` searches on *soup*.

    These examples were previously disabled by wrapping them in a bare
    triple-quoted string. They now live in a function that is not called
    at import time, so the script's output is unchanged; call
    ``demo_standard_selectors(soup)`` to run them.

    Args:
        soup: A parsed ``BeautifulSoup`` document containing at least
            one ``<a>`` tag.
    """
    # Search by tag name; find_all() returns a list of every match.
    print(soup.find_all("a")[0])
    print(soup.find_all("a"))

    # Search by arbitrary attribute, then by CSS class (`class_` avoids
    # clashing with the `class` keyword).
    print(soup.find_all(attrs={"name": "dromouse"}))
    print(soup.find_all(class_="title"))

    # Search text nodes. `string=` is the current keyword; the older
    # `text=` spelling used originally is deprecated in Beautiful Soup.
    print(soup.find_all(string="The Dormouse's story"))

    # `limit` caps the number of results; find() returns only the first
    # match (not wrapped in a list).
    print(soup.find_all("a", limit=1))
    print(soup.find("a"))

# CSS selectors: soup.select() accepts a CSS selector string.
def demo_css_selectors(soup):
    """Print examples of CSS-selector searches on *soup*.

    These examples were previously commented out line by line. They now
    live in a function that is not called at import time, so the
    script's output is unchanged; call ``demo_css_selectors(soup)`` to
    run them.

    Args:
        soup: A parsed ``BeautifulSoup`` document containing at least
            one ``<p>`` tag that has a ``class`` attribute.
    """
    # Elements with class "sister" nested inside elements with class
    # "story" (class names need a leading dot).
    print(soup.select(".story .sister"))
    # <a> tags nested inside <p> tags.
    print(soup.select("p a"))
    # The element whose id is "link2".
    print(soup.select("#link2"))

    first_p = soup.select("p")[0]
    # The first <p> tag itself.
    print(first_p)
    # The value of the first <p> tag's class attribute.
    print(first_p["class"])
    # get_text must be *called*; the original example omitted the
    # parentheses and would have printed the bound method instead.
    print(first_p.get_text())
    print(first_p.text)


print(soup.prettify())

 注意:

(1)当 class 属性值中带有空格(即同时包含多个类名)时,find_all 可以直接用完整的 class 字符串匹配;使用 select 时则要在每个类名前加点,并把各个类名连写在一起(中间不能有空格);

      例如:<td class="text text-danger">0</td>,可以分别按照下面方法来处理

  print(soup.find_all(class_="text text-danger")[0].text)

  print(soup.select(".text.text-danger")[0].text)

 

posted @ 2021-08-13 16:03  南山散人  阅读(141)  评论(0编辑  收藏  举报