A Python Crawler for the Xiaomi App Store

A crawler for the Xiaomi app store (app.mi.com) that extracts the download link of every app in a chosen category.
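Before reading the full script, it helps to see what the category-list endpoint returns: a JSON object whose data field is a list of app records. Below is a minimal probe, assuming the endpoint still responds the way it did when this post was written; the page/categoryId/pageSize parameters and the appId/displayName fields are the ones the script below relies on.

import requests

# note: "categoty" is how the live endpoint is actually spelled
resp = requests.get(
    'http://app.mi.com/categotyAllListApi',
    params={'page': 0, 'categoryId': 15, 'pageSize': 30},
)
for item in resp.json()['data'][:3]:
    # each record carries a numeric appId plus a display name;
    # http://app.mi.com/download/<appId> is the download link
    print(item['appId'], item['displayName'])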

Source code:

# -*- coding: UTF-8 -*-
import requests
import csv
import queue


class XiaoMiShop:

    def __init__(self, category):
        self.base_url = 'http://app.mi.com/categotyAllListApi'
        self.base_download = 'http://app.mi.com/download/'
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                          'AppleWebKit/537.36 (KHTML, like Gecko) '
                          'Chrome/79.0.3945.88 Safari/537.36'
        }
        self.csv_header = ['ID', 'App name', 'Download link']
        self.max_page = 70
        self.category = category
        self.queue = queue.Queue()

    def clean_data(self, data):
        '''
        Extract the fields we need from each record and put it on the queue.
        :param data: the "data" list from the API's JSON response
        :return:
        '''
        for i in data:
            app = {
                'ID': i.get('appId'),
                'App name': i.get('displayName'),
                'Download link': self.base_download + str(i.get('appId')),
            }
            self.queue.put(app)

    def request(self, page):
        params = {
            'page': page,
            'categoryId': int(self.category),
            'pageSize': 30
        }
        req = requests.get(url=self.base_url, params=params, headers=self.headers)
        req.encoding = req.apparent_encoding
        return req

    def spider_by_page(self, page, retry=3):
        '''
        Re-crawl a page that failed on the first pass.
        :param page: the page number that failed
        :param retry: number of attempts left
        :return:
        '''
        if retry > 0:
            print('Retrying page {}'.format(page))
            req = self.request(page=page)
            try:
                data = req.json()['data']
                if data:
                    self.clean_data(data)
                    print('Page {} retried successfully'.format(page))
            except (ValueError, KeyError):
                self.spider_by_page(page=page, retry=retry - 1)

    def spider(self):
        '''
        Crawl every page of the category (self.category is the module id,
        e.g. 15 for games), remembering failed pages for a retry pass.
        :return:
        '''
        fail_page = []
        for page in range(self.max_page):
            print('Crawling page {}'.format(page))
            req = self.request(page)
            try:
                data = req.json()['data']
            except (ValueError, KeyError):
                data = []
                fail_page.append(page)
            if data:
                self.clean_data(data)
        if fail_page:
            print('Failed pages:', fail_page)
            for page in fail_page:
                self.spider_by_page(page=page)
        else:
            print('No failed pages')

    def run(self):
        self.spider()
        data_list = []
        while not self.queue.empty():
            data_list.append(self.queue.get())
        with open('{}.csv'.format(self.category), 'w', newline='', encoding='utf-8-sig') as f:
            f_csv = csv.DictWriter(f, self.csv_header)
            f_csv.writeheader()
            f_csv.writerows(data_list)
        print('File saved; open {}.csv to see the results'.format(self.category))


if __name__ == '__main__':
    XiaoMiShop(15).run()
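Because the records accumulate in a queue.Queue, which is thread-safe, the same class can be driven for several categories in one run, and the page fetching could later be moved onto worker threads without touching run(). A small driver sketch follows; note that only category id 15 (games) is confirmed by this post, the other ids are placeholders:

if __name__ == '__main__':
    # 15 = games (from the post); 2 and 27 are hypothetical ids --
    # check the app.mi.com category pages before relying on them
    for category_id in (15, 2, 27):
        XiaoMiShop(category_id).run()

Each run writes its own <category_id>.csv, so the loop never mixes categories in one file.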


Crawl results:
(two screenshots of the crawl results, omitted here)

