import requests
from bs4 import BeautifulSoup
import re
from mysql_control import MySQL
# The three steps of a crawler
# 1. Send the request
def get_html(url, timeout=10):
    """Fetch *url* with a GET request and return the raw Response.

    Args:
        url: Target URL.
        timeout: Seconds to wait for connect/read. requests has NO default
            timeout, so without this the call can hang forever on a dead
            server; 10s keeps the old call sites working unchanged.

    Returns:
        The requests.Response object (caller inspects .json()/.text).
    """
    response = requests.get(url, timeout=timeout)
    return response
# 2. Parse the data
def parse_data(string):
    """Parse an HTML fragment of app <li> entries and yield one record each.

    Args:
        string: HTML fragment (the 'data.content' field of the API response)
            in which every <li> holds one app.

    Yields:
        6-tuples: (detail_url, icon_url, name, download_count, size, comment).
        *size* is '' when the entry has no "... MB" span.
    """
    soup = BeautifulSoup(string, 'lxml')
    # Every <li> in the fragment contains the data for exactly one app.
    li_list = soup.find_all(name='li')
    for li in li_list:
        # Detail-page URL from the entry's anchor.
        app_detail_url = li.find(name='a').attrs.get('href')
        print('详情页url: ', app_detail_url)
        # Icon URL and app name both live on the (lazy-loaded) <img> tag:
        # the real image URL is in data-original, the name in alt.
        img_attrs = li.find(name='img').attrs
        app_img = img_attrs.get('data-original')
        print('图标url: ', app_img)
        app_name = img_attrs.get('alt')
        print('名字: ', app_name)
        # Install count.
        app_download_num = li.find(name='span', attrs={'class': 'install-count'}).text
        print('下载量: ', app_download_num)
        # Size span is optional: some entries have no "... MB" title. Check the
        # find() result explicitly instead of catching a broad Exception, which
        # would also have swallowed unrelated bugs; default to '' when missing.
        size_tag = li.find(name='span', attrs={'title': re.compile('MB')})
        if size_tag is not None:
            app_size = size_tag.text
            print('大小:', app_size)
        else:
            app_size = ''
        # Short description.
        app_comment = li.find(name='div', attrs={'class': 'comment'}).text
        print('简介: ', app_comment)
        yield app_detail_url, app_img, app_name, app_download_num, app_size, app_comment
# 3. Save the data (store it in the database)
def save_data(generator_data, mysql_obj):
    """Insert every record yielded by *generator_data* into the app_data table.

    Args:
        generator_data: Iterable of 6-tuples matching the column list below
            (as produced by parse_data).
        mysql_obj: Object exposing execute(sql, params); params are passed
            separately so the driver escapes them (no SQL built from data).
    """
    # The INSERT statement is loop-invariant; build it once, not per row.
    sql = ('insert into app_data(app_detail_url, app_img, app_name, '
           'app_download_num, app_size, app_comment) '
           'values(%s, %s, %s, %s, %s, %s)')
    for data in generator_data:
        print(data)
        print(sql)
        mysql_obj.execute(sql, data)
if __name__ == '__main__':
    mysql_obj = MySQL()
    try:
        # Pages 1..41 of the Wandoujia "top apps" JSON API; each response
        # wraps an HTML <li> fragment inside JSON at data.content.
        for line in range(1, 42):
            url = f'https://www.wandoujia.com/wdjweb/api/top/more?resourceType=0&page={line}&ctoken=XrHfF5E1-zYoPHDD-yH8uNFE'
            # Fetch and decode the JSON payload.
            response = get_html(url)
            json_dict_data = response.json()
            # Guard both levels: a missing 'data' key previously raised
            # AttributeError on the chained .get(); an empty 'content'
            # simply means we ran past the last page.
            li_str = (json_dict_data.get('data') or {}).get('content')
            if not li_str:
                print(f'page {line}: no content, skipping')
                continue
            # parse_data is a generator; save_data drains it into the DB.
            generator_data = parse_data(li_str)
            save_data(generator_data, mysql_obj)
    finally:
        # Release the DB connection even if a page fails mid-run.
        mysql_obj.close()