爬虫-beautifulsoup的使用（5）

资料准备：

from bs4 import BeautifulSoup

html = """
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <title>bobby基本信息</title>
    <script src="//code.jquery.com/jquery-1.11.3.min.js"></script>
</head>
<body>
    <div id="info">
        <p style="color: blue">讲师信息</p>
        <div class="teacher_info info">
            python全栈工程师，7年工作经验，喜欢钻研python技术，对爬虫、
            web开发以及机器学习有浓厚的兴趣，关注前沿技术以及发展趋势。
            <p class="age">年龄: 29</p>
            <p class="name bobbyname" data-bind="bobby bobby2">姓名: bobby</p>
            <p class="work_years">工作年限: 7年</p>
            <p class="position">职位: python开发工程师</p>
        </div>
        <p style="color: aquamarine">课程信息</p>
        <table class="courses">
          <tr>
            <th>课程名</th>
            <th>讲师</th>
            <th>地址</th>
          </tr>
          <tr>
            <td>django打造在线教育</td>
            <td>bobby</td>
            <td><a href="https://coding.imooc.com/class/78.html">访问</a></td>
          </tr>
          <tr>
            <td>python高级编程</td>
            <td>bobby</td>
            <td><a href="https://coding.imooc.com/class/200.html">访问</a></td>
          </tr>
          <tr>
            <td>scrapy分布式爬虫</td>
            <td>bobby</td>
            <td><a href="https://coding.imooc.com/class/92.html">访问</a></td>
          </tr>
          <tr>
            <td>django rest framework打造生鲜电商</td>
            <td>bobby</td>
            <td><a href="https://coding.imooc.com/class/131.html">访问</a></td>
          </tr>
          <tr>
            <td>tornado从入门到精通</td>
            <td>bobby</td>
            <td><a href="https://coding.imooc.com/class/290.html">访问</a></td>
          </tr>
        </table>
    </div>

</body>
</html>

"""

数据处理

获取标签
# bs = BeautifulSoup(html, "html.parser")
# title_tag = bs.title
# print(title_tag.string)
# .string 返回标签内的文本内容
# div_tag = bs.div
# bs.div 只能获取到文档中的第一个 div
# print(div_tag.string)
# 注意：这个 div 含有多个子节点，.string 会返回 None；要取其中的全部文本可以用 div_tag.get_text()


# find 只返回第一个匹配的标签，找不到时返回 None
# div_tag = bs.find("div")
# div_tag = bs.find(id="info")
# div_tag = bs.find("div", id="info")
# 如果需要获取所有匹配的标签，可以使用 find_all
# div_tags = bs.find_all("div")
# for tag in div_tags:
#     print(tag.string)


import re
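# 使用正则表达式匹配属性值（正则建议写成原始字符串 r"..."，避免 \d 被当作无效的转义序列）
# div_tag = bs.find("div", id=re.compile(r"info-\d+"))
# 注意：该正则匹配形如 "info-955" 的 id；示例 HTML 中的 id 是 "info"，不带数字后缀，需要相应调整模式
# 也可以按文本内容匹配，返回的是内容恰好等于 "bobby" 的文本节点
# div_tag = bs.find(string="bobby")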
#

根据 HTML 的 DOM 结构获取父节点、子节点和兄弟节点的方式

# 
# 获取所有后代节点（子元素、子元素的子元素……）
# childrens = div_tag.descendants
# for child in childrens:
#     if child.name:
#         print(child.name)
# 
# 获取所有祖先节点（父元素、父元素的父元素……）
# parents = bs.find("p",{"class":"name"}).parents
# for parent in parents:
#     print(parent.name)

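# 获取兄弟节点
# previous_sibling 只返回前一个兄弟节点
# previous_sibling = bs.find("p",{"class":"name"}).previous_sibling
# print(previous_sibling.string)
# previous_siblings 返回前面所有兄弟节点的生成器
# previous_siblings = bs.find("p",{"class":"name"}).previous_siblings
# for sibling in previous_siblings:
#     print(sibling.string)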

获取属性的方式：

# 获取属性的方式
# name_tag = bs.find("p",{"class":"name"})
# class支持多值属性，返回的是一个列表
# print(name_tag["class"])
# print(name_tag.get("class"))
# 自定义的属性，返回的值是字符串
# print(name_tag["data-bind"])

缺点：所学的 API 为 BeautifulSoup 独有，无法迁移到其他解析库，一旦换库将意味着重新学习，总体学习成本高

优点：安装简单，入门难度较低

posted on 2020-07-17 14:11 topass123 阅读(86) 评论(0) 编辑收藏举报

道阻且长，行则将至，行而不辍，未来可期

爬虫-beautifulsoup的使用（5）