Multithreaded Web Scraping
Use Cases
1. Multiprocessing: CPU-bound programs
2. Multithreading: crawlers (network I/O), local disk I/O (see the sketch below)
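Why threads pay off for network I/O: while one thread waits on a response, Python releases the GIL, so the other threads keep working and total wall time approaches the slowest request instead of the sum. A minimal sketch; the httpbin URL is just an illustrative stand-in for real crawl targets:

import time
from threading import Thread
import requests

# illustrative URLs: each response is delayed by ~1 second
urls = ['http://httpbin.org/delay/1'] * 5

def fetch(url):
    requests.get(url)   # network I/O: the GIL is released while waiting

start = time.time()
threads = [Thread(target=fetch, args=(u,)) for u in urls]
for t in threads:
    t.start()
for t in threads:
    t.join()
# roughly ~1s with threads versus ~5s fetched sequentially
print('Elapsed: %.2f' % (time.time() - start))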
Queue
# Import the module
from queue import Queue

# Usage
q = Queue()
q.put(url)     # enqueue an item (here, a URL)
q.get()        # dequeue an item; blocks while the queue is empty
q.empty()      # check whether the queue is empty, returns True/False
Threading Module
# Import the module
from threading import Thread

# Usage flow
t = Thread(target=func)   # create a thread object
t.start()                 # start the thread
t.join()                  # block until the thread finishes, then reclaim it

# How to create multiple threads
t_list = []
for i in range(5):
    t = Thread(target=func)
    t_list.append(t)
    t.start()

for t in t_list:
    t.join()
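Putting the two modules together gives the worker pattern that both spiders below rely on: fill a Queue with tasks, then start several threads that keep pulling from it until it is drained. A minimal sketch; the print call stands in for real work:

from queue import Queue, Empty
from threading import Thread

q = Queue()
for n in range(10):
    q.put(n)                    # enqueue all tasks up front

def worker():
    while True:
        try:
            task = q.get_nowait()   # non-blocking get; raises Empty when drained
        except Empty:
            break
        print('handling task', task)  # placeholder for real work

t_list = []
for i in range(3):
    t = Thread(target=worker)
    t_list.append(t)
    t.start()
for t in t_list:
    t.join()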
Xiaomi App Store Scraper (Multithreaded)
Goal
1. Site: search Baidu for "Xiaomi App Store" (小米应用商店) and open the official site
2. Target: in the app category "Chat & Social" (聊天社交), collect each app's name and link
Implementation Steps
1. Confirm whether the page is dynamically loaded
1. The page refreshes only partially
2. Right-click to view the page source and search for a keyword; it cannot be found
# This site is dynamically loaded, so we need to capture and analyze its network packets (a quick programmatic check is sketched below)
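The same check can be done in code: fetch the raw page source with requests and search for an app name that is visible in the browser. If the keyword is missing from the source, the data must arrive via a later Ajax request. A minimal sketch, where both the category page URL pattern and the keyword '微信' are assumptions; use any app name you actually see on the page:

import requests

# assumed URL for the "Chat & Social" category page (categoryId=2)
url = 'http://app.mi.com/category/2'
html = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'}).text
# '微信' is just an example app name visible in the rendered page
print('微信' in html)   # False suggests the content is loaded dynamically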
2. Capture network packets with F12
1. Capture the URL that returns the JSON data (the Request URL in the Headers tab)
   http://app.mi.com/categotyAllListApi?page={}&categoryId=2&pageSize=30
2. Inspect and analyze the query parameters (Query String Parameters in the Headers tab)
   page: 1
   categoryId: 2
   pageSize: 30
# Only page changes: 0 1 2 3 ..., so we can build every JSON URL by controlling the value of page, as sketched below
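Since only page varies, the full set of JSON URLs can be generated with a single loop; a short sketch that assumes the category has 4 pages:

# build the JSON URLs for the "Chat & Social" category (categoryId=2)
url = 'http://app.mi.com/categotyAllListApi?page={}&categoryId=2&pageSize=30'
for page in range(4):   # page counts up from 0; 4 pages assumed here
    print(url.format(page))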
3. Code implementation
import requests
from threading import Thread
from queue import Queue
import time
from fake_useragent import UserAgent
from lxml import etree

class XiaomiSpider(object):
    def __init__(self):
        self.url = 'http://app.mi.com/categotyAllListApi?page={}&categoryId={}&pageSize=30'
        # Queue holding all URL addresses
        self.q = Queue()
        self.ua = UserAgent()
        self.i = 0
        # List holding every category id
        self.id_list = []

    def get_cateid(self):
        # Request
        url = 'http://app.mi.com/'
        headers = {'User-Agent': self.ua.random}
        html = requests.get(url=url, headers=headers).text
        # Parse
        parse_html = etree.HTML(html)
        xpath_bds = '//ul[@class="category-list"]/li'
        li_list = parse_html.xpath(xpath_bds)
        for li in li_list:
            typ_name = li.xpath('./a/text()')[0]
            typ_id = li.xpath('./a/@href')[0].split('/')[-1]
            # Work out how many pages this category has
            pages = self.get_pages(typ_id)
            self.id_list.append((typ_id, pages))
        # Put the URLs into the queue
        self.url_in()

    # Read the value of count and compute the page count
    def get_pages(self, typ_id):
        # Every page of JSON data carries the key "count"
        url = self.url.format(0, typ_id)
        html = requests.get(
            url=url,
            headers={'User-Agent': self.ua.random}
        ).json()
        count = html['count']
        # Ceiling division: pages of 30 items each (page numbers start at 0)
        pages = (int(count) + 29) // 30
        return pages

    # Put the URLs into the queue
    def url_in(self):
        for id in self.id_list:
            # id is a tuple: ('2', pages)
            for page in range(id[1]):
                url = self.url.format(page, id[0])
                print(url)
                # Enqueue the URL
                self.q.put(url)

    # Thread worker: get() - request - parse - process data
    def get_data(self):
        while True:
            if not self.q.empty():
                url = self.q.get()
                headers = {'User-Agent': self.ua.random}
                html = requests.get(url=url, headers=headers).json()
                self.parse_html(html)
            else:
                break

    # Parse function
    def parse_html(self, html):
        for app in html['data']:
            # App name and link
            name = app['displayName']
            link = 'http://app.mi.com/details?id=' + app['packageName']
            print(name, link)
            self.i += 1

    # Main function
    def main(self):
        # Put the URLs into the queue
        self.get_cateid()
        t_list = []
        # Create the worker threads (only 1 here; raise the count for real concurrency)
        for i in range(1):
            t = Thread(target=self.get_data)
            t_list.append(t)
            t.start()
        # Reclaim the threads
        for t in t_list:
            t.join()
        print('Count:', self.i)

if __name__ == '__main__':
    start = time.time()
    spider = XiaomiSpider()
    spider.main()
    end = time.time()
    print('Elapsed: %.2f' % (end - start))
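One caveat about get_data() above: with more than one worker, the empty() check and the get() call are not atomic, so a thread can see a non-empty queue and still block forever when another thread drains it first. A common fix is to call get() with a timeout and treat the queue.Empty exception as the stop signal; a sketch of the worker rewritten that way:

from queue import Empty

def get_data(self):
    while True:
        try:
            # wait up to 3 seconds for a URL, then treat the queue as drained
            url = self.q.get(timeout=3)
        except Empty:
            break
        headers = {'User-Agent': self.ua.random}
        html = requests.get(url=url, headers=headers).json()
        self.parse_html(html)

The multithreaded Tencent spider at the end of this section applies this pattern.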
Tencent Recruitment Data Scraper
1. Determine the URL and the targets
1. URL: search Baidu for "Tencent Recruitment" (腾讯招聘) and open the job listings
2. Targets: job title, job responsibilities, job requirements
2. Requirements and analysis
1. Viewing the page source shows that all the required data is loaded dynamically via Ajax
2. Capture the network packets with F12 and analyze them
3. Data scraped from the first-level page: job title
4. Data scraped from the second-level pages: job responsibilities and requirements
3. First-level page JSON URL (pageIndex varies; the timestamp was left unchecked)
https://careers.tencent.com/tencentcareer/api/post/Query?timestamp=1563912271089&countryId=&cityId=&bgIds=&productId=&categoryId=&parentCategoryId=&attrId=&keyword=&pageIndex={}&pageSize=10&language=zh-cn&area=cn
4. Second-level page URL (postId varies; it can be taken from the first-level page)
https://careers.tencent.com/tencentcareer/api/post/ByPostId?timestamp=1563912374645&postId={}&language=zh-cn
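Before writing the full spider, it is worth sanity-checking the second-level endpoint with a single request; a sketch where the postId value is a placeholder, so substitute one copied from a first-level response:

import requests

two_url = ('https://careers.tencent.com/tencentcareer/api/post/ByPostId'
           '?timestamp=1563912374645&postId={}&language=zh-cn')
# placeholder postId; copy a real one from the first-level JSON ('PostId' field)
data = requests.get(two_url.format('REAL_POST_ID')).json()['Data']
print(data['Responsibility'])
print(data['Requirement'])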
5. Code implementation
import requests
import json
import time
import random
from fake_useragent import UserAgent

class TencentSpider(object):
    def __init__(self):
        self.one_url = 'https://careers.tencent.com/tencentcareer/api/post/Query?timestamp=1563912271089&countryId=&cityId=&bgIds=&productId=&categoryId=&parentCategoryId=&attrId=&keyword=&pageIndex={}&pageSize=10&language=zh-cn&area=cn'
        self.two_url = 'https://careers.tencent.com/tencentcareer/api/post/ByPostId?timestamp=1563912374645&postId={}&language=zh-cn'

    # Build a random User-Agent header
    def get_headers(self):
        ua = UserAgent()
        headers = {'User-Agent': ua.random}
        return headers

    # Fetch the response body
    def get_page(self, url):
        html = requests.get(url=url, headers=self.get_headers()).content.decode('utf-8', 'ignore')
        # json.loads() turns a JSON string into Python data types
        html = json.loads(html)
        return html

    # Main routine: fetch all the data
    def parse_page(self, one_url):
        html = self.get_page(one_url)
        item = {}
        for job in html['Data']['Posts']:
            item['name'] = job['RecruitPostName']
            item['address'] = job['LocationName']
            # PostId is needed to build the second-level page URL
            post_id = job['PostId']
            # Responsibilities and requirements (second-level page)
            two_url = self.two_url.format(post_id)
            item['duty'], item['requirement'] = self.parse_two_page(two_url)
            print(item)

    def parse_two_page(self, two_url):
        html = self.get_page(two_url)
        # Responsibilities + requirements
        duty = html['Data']['Responsibility']
        requirement = html['Data']['Requirement']
        return duty, requirement

    # Get the total number of pages
    def get_pages(self):
        url = self.one_url.format(1)
        html = self.get_page(url)
        # Ceiling division: pages of 10 items each (pageIndex starts at 1)
        pages = (int(html['Data']['Count']) + 9) // 10
        return pages

    def main(self):
        # Total pages
        pages = self.get_pages()
        for index in range(1, pages + 1):
            one_url = self.one_url.format(index)
            self.parse_page(one_url)
            time.sleep(random.uniform(0.5, 1.5))

if __name__ == '__main__':
    start = time.time()
    spider = TencentSpider()
    spider.main()
    end = time.time()
    print('Elapsed: %.2f' % (end - start))
6. Multithreading approach:
Put all the first-level page URLs into a queue, then run several threads that pull from it and scrape the data.
7. Multithreaded code implementation
import requests
import json
import time
import random
from fake_useragent import UserAgent
from threading import Thread
from queue import Queue, Empty

class TencentSpider(object):
    def __init__(self):
        self.one_url = 'https://careers.tencent.com/tencentcareer/api/post/Query?timestamp=1563912271089&countryId=&cityId=&bgIds=&productId=&categoryId=&parentCategoryId=&attrId=&keyword=&pageIndex={}&pageSize=10&language=zh-cn&area=cn'
        self.two_url = 'https://careers.tencent.com/tencentcareer/api/post/ByPostId?timestamp=1563912374645&postId={}&language=zh-cn'
        self.q = Queue()
        # Counter
        self.i = 0

    # Build a random User-Agent header
    def get_headers(self):
        ua = UserAgent()
        headers = {'User-Agent': ua.random}
        return headers

    # Fetch the response body
    def get_page(self, url):
        html = requests.get(url=url, headers=self.get_headers()).content.decode('utf-8', 'ignore')
        # json.loads() turns a JSON string into Python data types
        html = json.loads(html)
        return html

    # Thread worker: fetch all the data
    def parse_page(self):
        while True:
            try:
                # get() with a timeout avoids the empty()/get() race noted earlier
                one_url = self.q.get(timeout=3)
            except Empty:
                break
            html = self.get_page(one_url)
            item = {}
            for job in html['Data']['Posts']:
                item['name'] = job['RecruitPostName']
                item['address'] = job['LocationName']
                # PostId is needed to build the second-level page URL
                post_id = job['PostId']
                # Responsibilities and requirements (second-level page)
                two_url = self.two_url.format(post_id)
                item['duty'], item['requirement'] = self.parse_two_page(two_url)
                print(item)
                self.i += 1
            # Sleep for a random interval after finishing each page
            time.sleep(random.uniform(0, 1))

    def parse_two_page(self, two_url):
        html = self.get_page(two_url)
        # Responsibilities + requirements
        duty = html['Data']['Responsibility']
        requirement = html['Data']['Requirement']
        return duty, requirement

    # Get the total number of pages
    def get_pages(self):
        url = self.one_url.format(1)
        html = self.get_page(url)
        # Ceiling division: pages of 10 items each (pageIndex starts at 1)
        pages = (int(html['Data']['Count']) + 9) // 10
        return pages

    def main(self):
        # Put the one_url addresses into the queue
        pages = self.get_pages()
        for index in range(1, pages + 1):
            one_url = self.one_url.format(index)
            self.q.put(one_url)
        t_list = []
        for i in range(5):
            t = Thread(target=self.parse_page)
            t_list.append(t)
            t.start()
        for t in t_list:
            t.join()
        print('Count:', self.i)

if __name__ == '__main__':
    start = time.time()
    spider = TencentSpider()
    spider.main()
    end = time.time()
    print('Elapsed: %.2f' % (end - start))
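A further refinement worth noting: with five threads, self.i += 1 is a read-modify-write on shared state, so increments can occasionally be lost under contention. A sketch of guarding the counter with a Lock from the threading module; only the two changed spots are shown:

from threading import Lock

# in __init__(), add a lock next to the counter:
self.lock = Lock()

# in parse_page(), replace "self.i += 1" with:
with self.lock:
    self.i += 1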