requests+bs4爬取豌豆荚排行榜及下载排行榜app

爬取排行榜应用信息

爬取豌豆荚排行榜app信息
    - app_detail_url - 应用详情页url
    - app_image_url - 应用图片url
    - app_name - 应用名称
    - app_install_count - 下载量
    - app_size - 应用大小
    - app_info - 应用简介

1.分析:
    - 目标url: https://www.wandoujia.com/top/app

    - 在网页中,发现了"加载更多"按钮,点击后会发起异步请求,请求url为:
        https://www.wandoujia.com/wdjweb/api/top/more?resourceType=0&page=2&ctoken=kuoxO3QZz7JKIJtuA6RXibwL

    - 修改page的值,可以得到响应数据,则可以直接爬接口数据,page范围为1~41

2. 爬取数据过程
    - 发送请求
    - 解析数据
    - 保存数据到MySQL数据库

代码

# top_app.py
import re

import requests
from bs4 import BeautifulSoup

from wandoujia.mysql_control import MySQL


# Request helper
def get_data(url, timeout=10):
    """Request a JSON endpoint and return the ``data`` field of its body.

    Args:
        url: Full URL of the JSON endpoint to request.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout ``requests`` can block indefinitely on a stalled
            connection.

    Returns:
        The value stored under the ``data`` key of the decoded JSON body,
        or ``None`` when that key is absent.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If no response arrives within ``timeout``.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on HTTP errors instead of trying to decode an error page.
    response.raise_for_status()
    return response.json().get('data')


# Parse one page of ranking data
def parse_data(json_data):
    """Yield the fields of every app card found in one API page.

    Args:
        json_data: Mapping whose ``content`` entry holds the HTML fragment
            containing the app cards.

    Yields:
        Tuples of ``(app_detail_url, app_image_url, app_name,
        app_install_count, app_size, app_info)``. ``app_size`` is an empty
        string when the card has no matching size label.
    """
    soup = BeautifulSoup(json_data.get('content'), 'lxml')
    # Compiled once per page: the size label is the <span> whose ``title``
    # attribute contains "MB".
    size_pattern = re.compile('MB')

    # Each app lives in its own <li class="card"> element.
    for li in soup.find_all(name='li', attrs={'class': 'card'}):
        # The first link inside the card points at the app detail page.
        app_detail_url = li.find(name='a').get('href')

        # The <img> tag carries both the image URL (``data-original``)
        # and the app name (``alt``).
        img_tag = li.find(name='img')
        app_image_url = img_tag.get('data-original')
        app_name = img_tag.get('alt')

        app_install_count = li.find(name='span', attrs={'class': 'install-count'}).text

        # A card may lack a size label; check for None explicitly instead
        # of using a bare ``except`` that would also hide real errors.
        size_tag = li.find(name='span', attrs={'title': size_pattern})
        app_size = size_tag.text if size_tag is not None else ''

        app_info = li.find(name='div', attrs={'class': 'comment'}).text.strip()
        yield app_detail_url, app_image_url, app_name, app_install_count, app_size, app_info


# Persist parsed records
def save_data(generator_data, mysql_obj):
    """Insert every parsed app record into the ``top_app`` table.

    Args:
        generator_data: Iterable of 6-item records, in the column order
            used by the insert statement below.
        mysql_obj: Object exposing ``execute(sql, args)``; it is called
            once per record.
    """
    # The statement and the separator never change, so build them once.
    insert_sql = (
        'insert into top_app(app_detail_url, app_image_url, app_name, app_install_count, app_size, app_info) '
        'values(%s, %s, %s, %s, %s, %s)'
    )
    separator = '*' * 100

    for data in generator_data:
        mysql_obj.execute(insert_sql, data)

        # Index 2 of each record is the app name.
        print(f'{data[2]} 数据已爬取成功')
        print(separator)


if __name__ == '__main__':
    # One database connection is opened up front and reused for every page.
    mysql_obj = MySQL()

    try:
        # The "load more" API is paginated; pages 1 through 41 are crawled.
        for page in range(1, 42):
            url = f'https://www.wandoujia.com/wdjweb/api/top/more?resourceType=0&page={page}&ctoken=kuoxO3QZz7JKIJtuA6RXibwL'

            # Fetch the page, parse its app cards, then store them.
            json_data = get_data(url)
            generator_data = parse_data(json_data)
            save_data(generator_data, mysql_obj)
    finally:
        # Release the cursor and connection even if a page fails midway.
        mysql_obj.close()

MySQL数据库

# mysql_control.py
import pymysql


class MySQL:
    """Small wrapper around a pymysql connection and a dict cursor.

    Instances can be used as context managers so that the cursor and the
    connection are closed when the block exits::

        with MySQL() as db:
            db.execute(sql, args)
    """

    def __init__(self, host='127.0.0.1', port=3306, database='wandoujia',
                 user='root', password='admin', charset='utf8'):
        """Open the connection and create a dict cursor.

        Every argument defaults to the value that used to be hard-coded,
        so ``MySQL()`` connects exactly as before; pass keyword arguments
        to target a different server, schema or account.
        """
        self.client = pymysql.connect(
            host=host,
            port=port,
            database=database,
            user=user,
            password=password,
            charset=charset,
            # Each statement is committed as soon as it is executed.
            autocommit=True,
        )

        # DictCursor returns rows as dictionaries keyed by column name.
        self.cursor = self.client.cursor(pymysql.cursors.DictCursor)

    def __enter__(self):
        """Return the wrapper itself for use in a ``with`` statement."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Close the cursor and connection; exceptions are not suppressed."""
        self.close()
        return False

    def execute(self, sql, args):
        """Run one parameterized statement, printing any error raised.

        Args:
            sql: Statement text with ``%s`` placeholders.
            args: Values bound to the placeholders.

        Errors are reported on stdout and not re-raised, so one failing
        statement does not abort the caller's loop.
        """
        try:
            self.cursor.execute(sql, args)
        except Exception as e:
            print(e)

    def close(self):
        """Close the cursor first, then the underlying connection."""
        self.cursor.close()
        self.client.close()

爬取详情页下载链接并下载

爬取豌豆荚排行榜app详情页

- 分析:
    - 目标url:在top_app文件中已有爬取的函数,可直接使用得到app_detail_url
    
    - 详情页面分析:
        - <div class="download-wp">下存在a标签<a class="normal-dl-btn">,href属性为下载链接

- 爬取数据过程:
	 - 使用上面top_app.py中的get_data()和parse_data()可得到每个app的详情页url
	 - 发送请求
	 - 解析数据
	 - 多线程下载app

代码

# top_app_detail.py
import os
from concurrent.futures import ThreadPoolExecutor
import requests
from bs4 import BeautifulSoup
from wandoujia import top_app


# Fetch a detail page
def get_detail_data(data, timeout=10):
    """Download the HTML of one app detail page.

    Args:
        data: Sequence whose first item is the detail page URL.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout ``requests`` can block indefinitely on a stalled
            connection.

    Returns:
        The response body decoded as text.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If no response arrives within ``timeout``.
    """
    response = requests.get(data[0], timeout=timeout)
    # Do not hand an error page to the parser.
    response.raise_for_status()
    return response.text


# Parse a detail page
def parse_detail_data(response, app_data=None):
    """Extract the download link from a detail page.

    Args:
        response: HTML text of the detail page.
        app_data: Record describing the app; its item at index 2 is used
            as the app name. When omitted, the module-level ``data``
            variable is used instead, which is what this function always
            did before the parameter existed.

    Returns:
        A ``(app_name, app_download_url)`` tuple.

    Raises:
        NameError: If ``app_data`` is omitted and no module-level ``data``
            exists (for example when called from another module).
    """
    if app_data is None:
        # Legacy fallback: the script's main loop binds each record to the
        # global name ``data`` before calling this function.
        app_data = data

    soup = BeautifulSoup(response, 'lxml')
    # The download button is the <a class="normal-dl-btn"> element; its
    # href is the download link.
    app_download_url = soup.find(name='a', attrs={'class': 'normal-dl-btn'}).get('href')

    return app_data[2], app_download_url


# Download and save one app
def download_app(app_name_download_url_tuple):
    """Download one app and write it into ``save_dir``.

    Args:
        app_name_download_url_tuple: ``(app_name, download_url)`` pair.
            The app name is used as the file name.

    Raises:
        requests.HTTPError: If the download URL answers with a 4xx/5xx
            status, so an error page is never saved as the app file.

    ``save_dir`` is a module-level name that the script entry point sets
    before any download is started.
    """
    app_name = app_name_download_url_tuple[0]
    download_url = app_name_download_url_tuple[1]

    # open() does not create missing directories.
    os.makedirs(save_dir, exist_ok=True)
    file_path = os.path.join(save_dir, app_name)

    print(app_name, '开始下载')
    # stream=True keeps the whole file from being buffered in memory.
    with requests.get(download_url, stream=True, timeout=30) as app_file:
        app_file.raise_for_status()
        with open(file_path, 'wb') as f:
            # iter_content yields the raw bytes unchanged; iter_lines would
            # strip line terminators and corrupt a binary file.
            for chunk in app_file.iter_content(chunk_size=64 * 1024):
                f.write(chunk)

    print(app_name, '下载完成')


if __name__ == '__main__':
    # Directory the apps are saved into. This must stay a module-level
    # name: download_app() reads ``save_dir`` as a global.
    save_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'top_app')
    # open() does not create missing directories, so create it up front.
    os.makedirs(save_dir, exist_ok=True)

    # (app name, future) pairs, kept so failed downloads can be reported.
    pending = []

    # Three download workers; the author notes that using too many threads
    # makes the downloads slow. Leaving the ``with`` block waits for every
    # submitted download to finish.
    with ThreadPoolExecutor(3) as pool:
        for page in range(1, 42):
            url = f'https://www.wandoujia.com/wdjweb/api/top/more?resourceType=0&page={page}&ctoken=kuoxO3QZz7JKIJtuA6RXibwL'

            # Reuse the ranking crawler to obtain the detail page URLs.
            json_data = top_app.get_data(url)
            generator_data = top_app.parse_data(json_data)

            # NOTE: the loop variable must be named ``data`` because
            # parse_detail_data() falls back to this module-level name.
            for data in generator_data:
                detail_response = get_detail_data(data)
                app_name_download_url_tuple = parse_detail_data(detail_response)

                # Single-threaded alternative:
                # download_app(app_name_download_url_tuple)
                future = pool.submit(download_app, app_name_download_url_tuple)
                pending.append((app_name_download_url_tuple[0], future))

    # An exception raised inside a worker is stored on its future and would
    # otherwise go unnoticed, so report every failed download.
    for app_name, future in pending:
        error = future.exception()
        if error is not None:
            print(app_name, '下载失败:', error)
posted @ 2019-12-31 20:55  油饼er  阅读(231)  评论(0编辑  收藏  举报