第八次作业

1.解析库之bs4:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
'''
Setup (run these in a shell, not in Python):

pip3 install beautifulsoup4  # install bs4
pip3 install lxml  # install the lxml parser
'''
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="sister"><b>$37</b></p>
<p class="story" id="p">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" >Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
 
<p class="story">...</p>
"""

# Bring in the BeautifulSoup class from the bs4 package.
from bs4 import BeautifulSoup

# Build a soup object from the markup.
#   argument 1: the text to parse
#   argument 2: the parser to use (html.parser, lxml, ...)
soup = BeautifulSoup(html_doc, 'lxml')

# Visual separator printed between the outputs below.
divider = '*' * 100

print(soup)
print(divider)
print(type(soup))
print(divider)

# Pretty-print the parsed document.
html = soup.prettify()
print(html)

2.bs之遍历文档树:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
html_doc = """<html><head><title>The Dormouse's story</title></head><body><p class="sister"><b>$37</b></p><p class="story" id="p">Once upon a time there were three little sisters; and their names were<b>tank</b><a href="http://example.com/elsie" class="sister" >Elsie</a>,<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;and they lived at the bottom of a well.<hr></hr></p><p class="story">...</p>"""

from bs4 import BeautifulSoup
soup = BeautifulSoup(html_doc, 'lxml')

'''
Traversing the document tree:
    1. direct attribute access
    2. tag name
    3. tag attributes
    4. tag content
    5. nested selection
    6. children and descendants
    7. parent and ancestors
    8. siblings
'''

# 1. Direct access: attribute lookup yields the first tag with that name.
first_p = soup.p
first_a = soup.a
print(first_p)  # first <p> tag
print(first_a)  # first <a> tag

# 2. Tag name
print(soup.head.name)  # name of the <head> tag

# 3. Tag attributes
link_attrs = first_a.attrs
print(link_attrs)  # every attribute of the first <a>
print(first_a.attrs['href'])  # just its href attribute

# 4. Tag content
print(first_p.text)  # $37

# 5. Nested selection
print(soup.html.head)

# 6. Children and descendants
body = soup.body
print(body.children)  # direct children of <body>; an iterator object
print(list(body.children))  # materialised as a list

print(body.descendants)  # all descendants
print(list(body.descendants))  # all descendants, as a list

# 7. Parent and ancestors
print(first_p.parent)  # parent node of the first <p>
# .parents is a generator object
print(first_p.parents)  # every ancestor of the first <p>
print(list(first_p.parents))

# 8. Siblings
# next sibling
print(first_p.next_sibling)
# all following siblings; a generator
print(first_p.next_siblings)
print(list(first_p.next_siblings))

# previous sibling of the first <a>
print(first_a.previous_sibling)
# all preceding siblings of the first <a>
print(first_a.previous_siblings)  # a generator
print(list(first_a.previous_siblings))

3.bs之搜索文档树:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
html_doc = """<html><head><title>The Dormouse's story</title></head><body><p class="sister"><b>$37</b></p><p class="story" id="p">Once upon a time there were three little sisters; and their names were<b>tank</b><a href="http://example.com/elsie" class="sister" >Elsie</a>,<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;and they lived at the bottom of a well.<hr></hr></p><p class="story">...</p>"""
'''
Searching the document tree:
    find()      find one
    find_all()  find many

Tag lookup and attribute lookup:
    Tags:
            name   match on tag name
            attrs  match on attributes
            text   match on text

        - string filter
            exact string match

        - regex filter
            match with the re module

        - list filter
            match any item in the list

        - bool filter
            True matches

        - function filter
            for lookups that combine wanted and unwanted attributes.

    Attributes:
        - class_
        - id
'''

import re

from bs4 import BeautifulSoup
soup = BeautifulSoup(html_doc, 'lxml')

# Attribute filter reused by the class-based lookups below.
sister_filter = {"class": "sister"}

# --- String filter ---
# name: first tag whose name is "p"
p_tag = soup.find(name='p')
print(p_tag)
# name: every tag whose name is "p"
tag_s1 = soup.find_all(name='p')
print(tag_s1)

# attrs: first node whose class is "sister"
p = soup.find(attrs=sister_filter)
print(p)
# attrs: every node whose class is "sister"
tag_s2 = soup.find_all(attrs=sister_filter)
print(tag_s2)

# text: match on the text "$37"
text = soup.find(text="$37")
print(text)

# Combined: an <a> tag with id "link2" and text "Lacie"
a_tag = soup.find(name="a", attrs={"id": "link2"}, text="Lacie")
print(a_tag)

# --- Regex filter ---
# name matched against a compiled pattern
p_tag = soup.find(name=re.compile('p'))
print(p_tag)

# --- List filter ---
# name matched against any entry; entries may be strings or patterns
name_filters = ['p', 'a', re.compile('html')]
tags = soup.find_all(name=name_filters)
print(tags)

# --- Bool filter ---
# True matches: a <p> tag that has an id attribute
p = soup.find(name='p', attrs={"id": True})
print(p)
 
# 方法过滤器
# 匹配标签名为a、属性有id没有class的标签
def have_id_class(tag):
    """Function filter: accept ``<a>`` tags that have an ``id`` but no ``class``.

    The original condition required ``class`` to be present, which
    contradicted the stated intent ("has id, no class").

    Args:
        tag: a tag-like object exposing ``name`` and ``has_attr(key)``.

    Returns:
        The tag itself (truthy) when it matches, otherwise ``None``.
    """
    if tag.name == 'a' and tag.has_attr('id') and not tag.has_attr('class'):
        return tag
    return None
 
# Pass the predicate function as the name filter and print whatever find() returns.
tag = soup.find(name=have_id_class)
print(tag)

4.爬取豌豆荚:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
'''
主页:
    图标地址、下载次数、大小、详情页地址
 
详情页:
    游戏名、图标名、好评率、评论数、小编点评、简介、网友评论、1-5张截图链接地址、下载地址
https://www.wandoujia.com/wdjweb/api/category/more?catId=6001&subCatId=0&page=1&ctoken=FRsWKgWBqMBZLdxLaK4iem9B
 
https://www.wandoujia.com/wdjweb/api/category/more?catId=6001&subCatId=0&page=2&ctoken=FRsWKgWBqMBZLdxLaK4iem9B
 
https://www.wandoujia.com/wdjweb/api/category/more?catId=6001&subCatId=0&page=3&ctoken=FRsWKgWBqMBZLdxLaK4iem9B
 
32
'''
import requests
from bs4 import BeautifulSoup
# 1、发送请求
def get_page(url):
    """Issue a GET request for ``url`` and hand back the response object."""
    return requests.get(url)
 
# 2、开始解析
# 解析主页
def parse_index(data):
    """Parse the app-list markup and print the index fields of every app card.

    For each ``<li class="card">`` element this prints, in order: the icon
    URL, the download count, the package size and the detail-page URL.

    Args:
        data: HTML markup containing the app cards.

    Raises:
        AttributeError: if a card lacks one of the expected tags.
        KeyError: if an expected attribute is missing on a tag.
    """
    import re

    soup = BeautifulSoup(data, 'lxml')

    # Raw string avoids the invalid "\d" escape; compiled once for all cards.
    size_pattern = re.compile(r"\d+MB")

    # Every app is rendered as an <li class="card"> element.
    for app in soup.find_all(name='li', attrs={"class": "card"}):
        # Icon URL: data-original attribute of the card's first <img>.
        img = app.find(name='img').attrs['data-original']
        print(img)

        # Download count: text of the span with class "install-count".
        down_num = app.find(name='span', attrs={"class": "install-count"}).text
        print(down_num)

        # Size: search inside THIS card. Searching the whole soup (as the
        # original did) always returns the first card's size.
        size = app.find(name='span', text=size_pattern).text
        print(size)

        # Detail-page URL: likewise scoped to the current card.
        detail_url = app.find(name='a', attrs={"class": "detail-check-btn"}).attrs['href']
        print(detail_url)
 
 
def main():
    """Walk list pages 1 through 32 of the category API and parse each page."""
    for page in range(1, 33):
        url = f"https://www.wandoujia.com/wdjweb/api/category/more?catId=6001&subCatId=0&page={page}&ctoken=FRsWKgWBqMBZLdxLaK4iem9B"

        # 1. Request the app-list endpoint.
        response = get_page(url)
        print('*' * 1000)

        # The endpoint answers with JSON; the card markup sits under
        # data -> content.
        payload = response.json()
        app_li = payload['data']['content']

        # 2. Parse the card markup.
        parse_index(app_li)


if __name__ == '__main__':
    main()
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
'''
主页:
    图标地址、下载次数、大小、详情页地址
 
详情页:
    游戏名、好评率、评论数、小编点评、下载地址、简介、网友评论、1-5张截图链接地址、
https://www.wandoujia.com/wdjweb/api/category/more?catId=6001&subCatId=0&page=1&ctoken=FRsWKgWBqMBZLdxLaK4iem9B
 
https://www.wandoujia.com/wdjweb/api/category/more?catId=6001&subCatId=0&page=2&ctoken=FRsWKgWBqMBZLdxLaK4iem9B
 
https://www.wandoujia.com/wdjweb/api/category/more?catId=6001&subCatId=0&page=3&ctoken=FRsWKgWBqMBZLdxLaK4iem9B
 
32
'''
import requests
from bs4 import BeautifulSoup
# 1、发送请求
def get_page(url):
    """Issue a GET request for ``url`` and hand back the response object."""
    return requests.get(url)
 
# 2、开始解析
# 解析详情页
def parse_detail(text):
    """Extract the headline fields from an app detail page and print a summary.

    Args:
        text: HTML markup of the detail page.
    """
    soup = BeautifulSoup(text, 'lxml')

    # App name: <span class="title">
    title_tag = soup.find(name="span", attrs={"class": "title"})
    name = title_tag.text

    # Approval rate: <span class="love">
    love_tag = soup.find(name='span', attrs={"class": "love"})
    love = love_tag.text

    # Number of comments: <a class="comment-open">
    comment_tag = soup.find(name='a', attrs={"class": "comment-open"})
    commit_num = comment_tag.text

    # Editor's review: <div class="con">
    review_tag = soup.find(name='div', attrs={"class": "con"})
    commit_content = review_tag.text

    # Download link: href of <a class="normal-dl-btn">
    download_tag = soup.find(name='a', attrs={"class": "normal-dl-btn"})
    download_url = download_tag.attrs['href']

    summary = f'''
        ============= tank ==============
        app名称:{name}
        好评率: {love}
        评论数: {commit_num}
        小编点评: {commit_content}
        app下载链接: {download_url}
        ============= end ==============
        '''
    print(summary)
 
 
 
# 解析主页
def parse_index(data):
    """Parse the app-list markup, print each card's fields and crawl its detail page.

    For each ``<li class="card">`` element this prints the icon URL, download
    count, package size and detail-page URL, then fetches the detail page
    with ``get_page`` and hands its HTML to ``parse_detail``.

    Args:
        data: HTML markup containing the app cards.

    Raises:
        AttributeError: if a card lacks one of the expected tags.
        KeyError: if an expected attribute is missing on a tag.
    """
    import re

    soup = BeautifulSoup(data, 'lxml')

    # Raw string avoids the invalid "\d" escape; compiled once for all cards.
    size_pattern = re.compile(r"\d+MB")

    # Every app is rendered as an <li class="card"> element.
    for app in soup.find_all(name='li', attrs={"class": "card"}):
        # Icon URL: data-original attribute of the card's first <img>.
        img = app.find(name='img').attrs['data-original']
        print(img)

        # Download count: text of the span with class "install-count".
        down_num = app.find(name='span', attrs={"class": "install-count"}).text
        print(down_num)

        # Size: search inside THIS card. Searching the whole soup (as the
        # original did) always returns the first card's size.
        size = app.find(name='span', text=size_pattern).text
        print(size)

        # Detail-page URL: href of the first <a> in the card.
        detail_url = app.find(name='a').attrs['href']
        print(detail_url)

        # 3. Request the app's detail page.
        response = get_page(detail_url)

        # 4. Parse the detail page.
        parse_detail(response.text)
 
 
def main():
    """Walk list pages 1 through 32 of the category API and parse each page."""
    for page in range(1, 33):
        url = f"https://www.wandoujia.com/wdjweb/api/category/more?catId=6001&subCatId=0&page={page}&ctoken=FRsWKgWBqMBZLdxLaK4iem9B"

        # 1. Request the app-list endpoint.
        response = get_page(url)
        print('*' * 1000)

        # The endpoint answers with JSON; the card markup sits under
        # data -> content.
        payload = response.json()
        app_li = payload['data']['content']

        # 2. Parse the card markup.
        parse_index(app_li)


if __name__ == '__main__':
    main()

5.mongoDB的简单使用:

MongoDB 非关系型数据库
一 安装与使用
1、下载安装
https://www.mongodb.com/download-center/community

2、在C盘创建一个data/db文件夹
- 数据的存放路径

3、mongod启动服务
进入终端,输入mongod启动mongoDB服务。

4、mongo进入mongoDB客户端
打开一个新的终端,输入mongo进入客户端

二 数据库操作

数据库操作:
切换库:
SQL:
use admin; 有则切换,无则报错。

MongoDB:
use tank; 有则切换,无则创建,并切换到tank库中。

查数据库:
SQL:
show databases;

MongoDB:
show dbs;
显示的数据库若无数据,则不显示。

删除库:
SQL:
drop database

MongoDB:
db.dropDatabase()


集合操作: MySQL中叫做表。
创建集合:
SQL:
create table 表名(f1 类型, f2 类型, ...);

MongoDB:
# 在当前库中通过.来创建集合
db.student

插入数据:
# 插入多条数据
db.student.insert([{"name1": "tank1"}, {"name2": "tank2"}])

# 插入一条
db.student.insert({"name": "tank"})


查数据:
# 查找student集合中所有数据
db.student.find({})

# 按条件查 查找name为tank的所有记录(只取一条可用findOne)
db.student.find({"name":"tank"})

三 python链接MongoDB
1、下载第三方模块pymongo
pip3 install pymongo

2、链接mongoDB客户端
from pymongo import MongoClient
client = MongoClient('localhost', 27017)

6.pymongo简单使用:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
from pymongo import MongoClient
 
# 1. Connect to MongoDB.
#    argument 1: host / IP address of the MongoDB server
#    argument 2: port of the MongoDB server (27017 here)
client = MongoClient('localhost', 27017)
# print(client)

# 2/3. Select the tank_db database and its people collection by indexing.
people = client['tank_db']['people']

# 4. Insert data.
# NOTE: the legacy Collection.insert() is deprecated and was removed in
# PyMongo 4; insert_one() / insert_many() are the supported calls. They need
# a document (or list of documents) argument, so the argument-less
# insert_one() / insert_many() demo calls that ended the original script
# (and raised TypeError) are folded into the real inserts below.

# 4.1 Insert a single document.
data1 = {
    'name': 'tank',
    'age': 18,
    'sex': 'male'
}
people.insert_one(data1)

# 4.2 Insert several documents at once.
# data1 is rebuilt as a fresh dict for the batch insert.
data1 = {
    'name': 'tank',
    'age': 18,
    'sex': 'male'
}
data2 = {
    'name': '戚志云',
    'age': 84,
    'sex': 'female'
}
data3 = {
    'name': '沈金金',
    'age': 73,
    'sex': 'male'
}
people.insert_many([data1, data2, data3])

# 5. Query data.
# All documents: find() returns a cursor, not the documents themselves.
data_s = people.find()
print(data_s)  # <pymongo.cursor.Cursor object at 0x000002EEA6720128>
# Iterate the cursor to print every document.
for data in data_s:
    print(data)

# A single document.
data = people.find_one()
print(data)

二.作业:

1、整理课堂内容,并写博客

2、基于豌豆荚爬取剩下的简介截图图片地址、网友评论

3、把豌豆荚爬取的数据插入mongoDB中
- 创建一个wandoujia库
- 把主页的数据存放一个名为index集合中
- 把详情页的数据存放一个名为detail集合中

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
'''
主页:
    图标地址、下载次数、大小、详情页地址
 
详情页:
    游戏名、好评率、评论数、小编点评、下载地址、简介、网友评论、1-5张截图链接地址、
https://www.wandoujia.com/wdjweb/api/category/more?catId=6001&subCatId=0&page=1&ctoken=FRsWKgWBqMBZLdxLaK4iem9B
 
https://www.wandoujia.com/wdjweb/api/category/more?catId=6001&subCatId=0&page=2&ctoken=FRsWKgWBqMBZLdxLaK4iem9B
 
https://www.wandoujia.com/wdjweb/api/category/more?catId=6001&subCatId=0&page=3&ctoken=FRsWKgWBqMBZLdxLaK4iem9B
 
32
'''
import requests
from bs4 import BeautifulSoup
from pymongo import MongoClient
'''
3、把豌豆荚爬取的数据插入mongoDB中
    - 创建一个wandoujia库
        - 把主页的数据存放一个名为index集合中
        - 把详情页的数据存放一个名为detail集合中
'''
# Connect to MongoDB on localhost.
client = MongoClient('localhost', 27017)
# Both collections belong to the "wandoujia" database.
_wandoujia_db = client['wandoujia']
# Collection for index-page records.
index_col = _wandoujia_db['index']
# Collection for detail-page records.
detail_col = _wandoujia_db['detail']
 
# 1、发送请求
def get_page(url):
    """Issue a GET request for ``url`` and hand back the response object."""
    return requests.get(url)
 
 
# 2、开始解析
# 解析详情页
def parse_detail(text):
    """Extract the detail-page fields of one app and store them in ``detail_col``.

    Missing fields no longer abort the insert. The original built
    ``detail_data`` in three separate ``if`` branches, so it was unbound when
    only the name, comment count or editor review was missing. When both the
    approval rate and the download link were missing, the last branch
    overwrote the approval-rate placeholder with ``None``. The record is now
    built once, with placeholders applied per field.

    Args:
        text: HTML markup of the detail page.
    """
    soup = BeautifulSoup(text, 'lxml')

    def _text_of(tag_name, css_class):
        # Text of the first matching tag, or None when the tag is absent.
        found = soup.find(name=tag_name, attrs={"class": css_class})
        return found.text if found is not None else None

    # App name
    name = _text_of('span', 'title')
    # Approval rate
    love = _text_of('span', 'love')
    # Number of comments
    commit_num = _text_of('a', 'comment-open')
    # Editor's review
    commit_content = _text_of('div', 'con')

    # Download link: href of <a class="normal-dl-btn">, None if tag or href is missing.
    download_tag = soup.find(name='a', attrs={"class": "normal-dl-btn"})
    download_url = download_tag.attrs.get('href') if download_tag is not None else None

    detail_data = {
        'name': name,
        # Placeholder when nobody has rated the app.
        'love': love or "没人点赞,很惨",
        'commit_num': commit_num,
        'commit_content': commit_content,
        # Placeholder when no installer package is offered.
        'download_url': download_url or '没有安装包',
    }

    # Store the record; insert_one() replaces the legacy insert(), which was
    # removed in PyMongo 4.
    detail_col.insert_one(detail_data)
    print(f'{name}app数据插入成功!')
 
# 解析主页
def parse_index(data):
    """Parse the app-list markup, store each card in ``index_col`` and crawl its detail page.

    For each ``<li class="card">`` element the icon URL, download count,
    package size and detail-page URL are inserted into ``index_col``. The
    detail page is then fetched with ``get_page`` and handed to
    ``parse_detail``.

    Args:
        data: HTML markup containing the app cards.

    Raises:
        AttributeError: if a card lacks one of the expected tags.
        KeyError: if an expected attribute is missing on a tag.
    """
    import re

    soup = BeautifulSoup(data, 'lxml')

    # Raw string avoids the invalid "\d" escape; compiled once for all cards.
    size_pattern = re.compile(r"\d+MB")

    # Every app is rendered as an <li class="card"> element.
    for app in soup.find_all(name='li', attrs={"class": "card"}):
        # Icon URL: data-original attribute of the card's first <img>.
        img = app.find(name='img').attrs['data-original']

        # Download count: text of the span with class "install-count".
        down_num = app.find(name='span', attrs={"class": "install-count"}).text

        # Size: search inside THIS card. Searching the whole soup (as the
        # original did) stored the first card's size for every app.
        size = app.find(name='span', text=size_pattern).text

        # Detail-page URL: href of the first <a> in the card.
        detail_url = app.find(name='a').attrs['href']

        # Assemble the index record.
        index_data = {
            'img': img,
            'down_num': down_num,
            'size': size,
            'detail_url': detail_url
        }

        # Store the record; insert_one() replaces the legacy insert(), which
        # was removed in PyMongo 4.
        index_col.insert_one(index_data)
        print('主页数据插入成功!')

        # 3. Request the app's detail page.
        response = get_page(detail_url)

        # 4. Parse the detail page.
        parse_detail(response.text)
 
 
def main():
    """Crawl list pages 1 through 32, then close the MongoDB client.

    The original called ``client.close()`` inside the loop, i.e. after the
    first page, although the intent was to close once everything had run.
    The close now happens once, in ``finally``, so it also runs when a page
    fails.
    """
    try:
        for page in range(1, 33):
            url = f"https://www.wandoujia.com/wdjweb/api/category/more?catId=6001&subCatId=0&page={page}&ctoken=FRsWKgWBqMBZLdxLaK4iem9B"

            # 1. Request the app-list endpoint.
            response = get_page(url)
            print('*' * 1000)

            # The endpoint answers with JSON; the card markup sits under
            # data -> content.
            data = response.json()
            app_li = data['data']['content']

            # 2. Parse the card markup (this also crawls the detail pages).
            parse_index(app_li)
    finally:
        # Close the MongoDB client once all pages have been processed.
        client.close()


if __name__ == '__main__':
    main()

posted on 2019-06-21 16:09  肥嘟嘟左卫门!  阅读(80)  评论(0)  编辑  收藏  举报

导航