bs4遍历文档树+bs4搜索文档树


    
# cicd，后端项目高可用，redis高可用，权限管理表设计
    -持续集成持续部署  jenkins
  -开发写完代码---》gitlab---》jenkins定时从gitlab拉取代码---》编译---》把可执行文件部署到测试服务器（docker仓库）---》供测试人员测试
  -项目高可用是nginx做负载
  -keepalived -->nginx（保证nginx自身的高可用）
  -rbac
  
# MySQL用MVCC还需要用乐观锁吗？delete, insert, update不是会自动加锁吗？

   -乐观锁悲观锁 应用程序来讲，不是站在数据库层面
  -创建一个订单： 10  在订单表插入数据，商品数量-1


# 字典，列表底层
    -{key:value}--->底层数组---》连续存储的内存空间
  -根据key去通过hash函数运算---》得到一个数字---》存到数组对应的位置
  -字典的key值要可hash--》数字，字符串，元组（元组里的元素也都可hash时）---》不可变类型
  -一个对象是否可hash---》魔法方法__hash__决定的  hash(对象)
    -hash冲突解决：开放定址法，再哈希法，链地址法
  
  -字典底层

使用数组存储---》通过对key值的hash操作--》存到数组中----》所以key值必须可hash(不可变数据类型)---》一旦hash，就可能出现hash冲突---》常见解决办法：开放定址法，再hash，链地址法---》CPython的字典用的是开放定址法解决冲突（不是链地址法）

  
  -列表底层---》也是数组

   -数据存在数组中---》插入，删除 时间复杂度是o(n)
    -在最后位置插入(append)---》均摊o(1)，触发扩容时最坏o(n)
    -取数据时间复杂度 o(1)


    
    
 -深浅拷贝

   -python中一切皆对象---》一切皆地址
  deepcopy  copy  直接赋值
  
  a=[1,2,[1,2,3]]
  b=a            # 直接赋值：b和a指向同一个对象
  c=copy(a)      # 浅拷贝：外层是新列表，但内层的[1,2,3]仍是同一个对象
  d=deepcopy(a)  # 深拷贝：外层和内层都是新对象，和a互不影响

1 bs4 遍历文档树


# Fixture for the tree-navigation examples: a small HTML snippet parsed into a
# BeautifulSoup tree using the 'lxml' parser.
from bs4 import BeautifulSoup
# NOTE(review): inside <title> the second <em> is another opening tag rather
# than </em>, and <body>/<html> are never closed in this literal -- confirm
# whether the malformed markup is intentional.
html_doc = """
<html><head><title>The Dormouse's <em>lqz<em> story</title></head>
<body>
<p id="my p" class="title"><b id="bbb" class="boldest">The Dormouse's story</b></p>

<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>

<p class="story">...</p>
"""

# Parsed document that the navigation examples operate on.
soup=BeautifulSoup(html_doc,'lxml')


#1  遍历文档树---》按  .     去搜索
# head=soup.head
# print(head)
# title=head.title
# print(title)


# 2 获取标签名
# res=soup.p.name
# print(type(res))

# 3 获取标签属性
# res=soup.a.attrs['href']
# print(res)


# 4 获取标签的内容
# res=soup.head.title.strings
# text: 该标签的文本内容,把子子孙孙标签内容都拿出来
# string：标签下只有一个文本子节点(或只有一个子标签，且该子标签的string能取到)时才能取出来，否则是None
# strings：generator 生成器，把子子孙孙的文本内容放在里面
# print(list(res))

# 5 嵌套选择
# print(soup.head.title.string)
# print(soup.body.a.string)

#6、子节点、子孙节点
# print(soup.p.contents) #p下所有子节点，放到列表中（不包含孙）
# print(soup.p.children) #得到一个迭代器,包含p下所有子节点，（不包含孙）
# print(soup.p.descendants) #得到一个生成器,包含p下所有子孙节点(标签和文本节点都算)


#7、父节点、祖先节点
# print(soup.a.parent) #获取a标签的父节点，直接父亲
# print(list(soup.a.parents)) #找到a标签所有的祖先节点，父亲的父亲，父亲的父亲的父亲...

#8、兄弟节点

# print(soup.a.next_sibling) #下一个兄弟,一个
# print(soup.a.previous_sibling) #上一个兄弟，一个
#
# print(list(soup.a.next_siblings)) #下面的兄弟们=>生成器对象
# print(soup.a.previous_siblings) #上面的兄弟们=>生成器对象


# 通过 . 去查找找到的第一个标签，获取标签的内容(text)和属性(attrs)

2 bs4 搜索文档树




# Fixture for the tree-search examples: same snippet as before except that the
# <b> tag here has id="id_bbb" and the text "The".
from bs4 import BeautifulSoup
# NOTE(review): inside <title> the second <em> is another opening tag rather
# than </em>, and <body>/<html> are never closed in this literal -- confirm
# whether the malformed markup is intentional.
html_doc = """
<html><head><title>The Dormouse's <em>lqz<em> story</title></head>
<body>
<p id="my p" class="title"><b id="id_bbb" class="boldest">The</b></p>

<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>

<p class="story">...</p>
"""

# Parsed document that the find()/find_all() examples operate on.
soup=BeautifulSoup(html_doc,'lxml')
# 效率会低于遍历文档树
#五种过滤器: 字符串、正则表达式、列表、True、方法
# find：找到第一个
# find_all：找到所有

# 1 字符串 -->名字，类，属性名 ---》都是字符串
# b=soup.find(name='b')
# b=soup.find(class_='boldest')
# b=soup.find(id='id_bbb')
# b=soup.find(string="The")  # 按文本内容查找；text是旧版参数名，新版bs4用string

# a=soup.find(href='http://example.com/elsie')
# b=soup.find(name='b',class_='boldest')  # 名字是b，并且类名是boldest
# print(b)



# 2 正则表达式
# import re
# # res=soup.find_all(name=re.compile('^b')) #找出b开头的标签，结果有body和b标签
# # print(res)
# # 找出页面中所有链接
# # res=soup.find_all(href=re.compile('^http:')) #找出href以http:开头的标签
# # print(res)
#
# # 找出所有id以id_开头的标签
# res=soup.find_all(id=re.compile('^id_')) #找出id以id_开头的标签
# print(res)

# 3 列表
# res=soup.find_all(name=['body','p']) #找出所有body标签和p标签
# res=soup.find_all(class_=['sister','story']) #找出类名是sister或story的标签
# print(res)

# 4 布尔
# res=soup.find_all(class_=True)   # 找到所有有类名的标签
# res=soup.find_all(id=True)   # 找到所有有id的标签
# res=soup.find_all(name='img',src=False)   # 找到所有没有src属性的img标签
# res=soup.find_all(href=True)
# res=soup.find_all(src=True)
# print(res)

# 5 方法
# 找到所有有class，但是没有id的标签
# def has_class_but_no_id(tag):
#     return tag.has_attr('class') and not tag.has_attr('id')
#
# res=soup.find_all(name=has_class_but_no_id)
# print(res)
# print(len(res))



## 其他参数  attrs , recursive   limit

# 标签所有属性都可以放到attrs中
# res=soup.find_all(attrs={'class':True})
# print(res)
# res=soup.find_all(class_=True)
# print(res)

# recursive :是否递归查找,查找子子孙
# res=soup.find_all(class_=True,recursive=False) # 速度快，但是只会找一层
# res=soup.find_all(class_=True,recursive=True)
# print(res)

# limit:限制返回结果的个数；limit=1时只找符合条件的第一个，效果相当于find（但find_all返回的仍是列表）
# soup.find()
# res=soup.find_all(class_=True,recursive=True,limit=1)



### 遍历文档树和搜索文档树可以混用
# Both lines look up the element whose id is "id_bbb"; the second assignment
# overwrites the first, so only the last result stays bound to `body`.
# NOTE(review): despite its name, `body` holds the element with that id, not
# the <body> tag.
body=soup.body.find(id='id_bbb')  # navigate to <body> first, then search inside it (original note: fast)
body=soup.find(id='id_bbb')  # search starting from the document root

3 css选择器






# Fixture for the CSS-selector examples: a small HTML snippet parsed into a
# BeautifulSoup tree using the 'lxml' parser.
from bs4 import BeautifulSoup
# NOTE(review): inside <title> the second <em> is another opening tag rather
# than </em>, and <body>/<html> are never closed in this literal -- confirm
# whether the malformed markup is intentional.
html_doc = """
<html><head><title>The Dormouse's <em>lqz<em> story</title></head>
<body>
<p id="my p" class="title"><b id="id_bbb" class="boldest">The</b></p>

<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>

<p class="story">...</p>
"""

# Parsed document that the select() examples operate on.
soup=BeautifulSoup(html_doc,'lxml')


# CSS selectors accepted by select():
#   .classname  -> match by class
#   #id         -> match by id
#   div>p       -> <p> that is a direct child of a <div>
#   div p       -> <p> anywhere inside a <div> (any depth)
# res=soup.select("#id_bbb")
# Select every <p class="story"> that is a descendant of <body>; select()
# returns a list of matching tags.
res=soup.select('body p.story')
print(res)

# 终极大招：去复制--》在浏览器开发者工具里右键元素，Copy selector，直接复制css选择器

# 页面中只要看到的东西，都能用bs4解析出来--》存到库中

4 测试


import requests
from bs4 import BeautifulSoup

# Download the article page. The timeout keeps the script from hanging forever
# on a stalled connection, and raise_for_status() fails loudly on an HTTP error
# instead of silently parsing an error page.
res=requests.get('https://www.cnblogs.com/liuqingzheng/p/16005875.html',timeout=10)
res.raise_for_status()
# print(res.text)
soup=BeautifulSoup(res.text,'lxml')
# title=soup.find(name='a',id='Header1_HeaderTitle').text
# print(title)

# a=soup.find(name='div',class_='postDesc').find(name='span',id='post_view_count').text
# print(a)

# The HTML returned by requests may be incomplete compared with what a browser
# shows: the browser runs the page's JavaScript, requests does not. Content
# loaded by JS has to be fetched by working out the request the JS makes and
# sending that request yourself.


# select_one() returns None when nothing matches, so a changed page layout is
# reported clearly instead of crashing with IndexError on [0].
strong=soup.select_one('#cnblogs_post_body > p:nth-child(22) > strong')
res=strong.text if strong is not None else None
if res is None:
    print('selector matched nothing: #cnblogs_post_body > p:nth-child(22) > strong')
else:
    print(res)

posted @ 2022-03-17 23:14 甜甜de微笑阅读(228) 评论(0) 编辑收藏举报

刷新页面返回顶部

甜甜de微笑

bs4遍历文档树+bs4搜索文档树

1 bs4 遍历文档树

2 bs4 搜索文档树

3 css选择器

4 测试

公告