多进程实例——爬取百度贴吧
上次介绍了多进程并发相关内容,本次以爬取百度贴吧为例,进行实战演示。
爬取的网址:http://tieba.baidu.com/p/3522395718
本次爬取每层楼的发帖人、发帖内容和发帖时间。
闲话不说,直接上代码。
# -*- coding: utf-8 -*-
"""Compare sequential and multi-process scraping of one Baidu Tieba thread.

Every URL in ``urls`` is fetched twice: first one after another in the main
process, then again through a pool of four worker processes.  For each post
found, the author name, post date and post text are printed.  Finally the
elapsed time of both runs is printed.
"""
import json
import sys
import time
from multiprocessing import Pool

import requests
from bs4 import BeautifulSoup

# One URL per value of the ``pn`` query parameter, 1 through 19
# (presumably the page number of the thread -- confirm against the site).
urls = ['http://tieba.baidu.com/p/3522395718?pn={}'.format(i) for i in range(1, 20)]

# Seconds to wait for the server before giving up on a request.  Without a
# timeout a stalled connection would block the run (or a worker) forever.
REQUEST_TIMEOUT = 10


def _post_date(post):
    """Return the date stored in a post's ``data-field`` JSON, or None.

    None is returned when the attribute is missing, is not valid JSON, or
    does not contain ``content.date``, so one odd post cannot abort a page.
    """
    try:
        return json.loads(post.get('data-field'))['content']['date']
    except (TypeError, ValueError, KeyError):
        # TypeError: attribute missing (None) or unexpected JSON shape;
        # ValueError: malformed JSON; KeyError: expected key absent.
        return None


def return_infos(url):
    """Fetch one page and print author, date and text of each post on it.

    Args:
        url: Address of the page to download.

    Returns:
        None.  Results are written to stdout; a page that cannot be
        downloaded is reported on stderr and skipped.
    """
    try:
        html = requests.get(url, timeout=REQUEST_TIMEOUT)
        html.raise_for_status()
    except requests.RequestException as exc:
        # Handle the failure here so that a single bad page does not abort
        # the whole run, and no exception has to cross the process boundary.
        print('failed to fetch {}: {}'.format(url, exc), file=sys.stderr)
        return

    soup = BeautifulSoup(html.content, 'lxml')
    items = soup.select('div.p_postlist div.l_post.j_l_post.l_post_bright')
    contents = soup.select('div.p_postlist div.d_post_content.j_d_post_content.clearfix')
    names = soup.select('div.p_postlist li.d_name a')

    # NOTE(review): the three lists are selected independently and paired by
    # position; zip() stops at the shortest one.  This assumes every post
    # yields exactly one match per selector -- confirm against the page HTML.
    for post, body, author in zip(items, contents, names):
        date = _post_date(post)
        content = body.text.strip()
        name = author.text.strip()
        print(name, date, content)


if __name__ == '__main__':
    # perf_counter() is the clock intended for measuring elapsed intervals;
    # unlike time.time() it is not affected by system clock adjustments.
    t0 = time.perf_counter()
    for ix in urls:
        return_infos(ix)
    t1 = time.perf_counter()

    # The with-block guarantees the workers are cleaned up even if map() raises.
    with Pool(4) as pool:
        pool.map(return_infos, urls)
        pool.close()  # close the pool: no further tasks are accepted
        pool.join()   # block the main process until the workers have exited
    t2 = time.perf_counter()
    print("正常执行的时间:", (t1 - t0))
    print("并行执行时间:", (t2 - t1))
# -*- coding: utf-8 -*-
"""Compare sequential and multi-process scraping of one Baidu Tieba thread.

Every URL in ``urls`` is fetched twice: first one after another in the main
process, then again through a pool of four worker processes.  For each post
found, the author name, post date and post text are printed.  Finally the
elapsed time of both runs is printed.
"""
import json
import sys
import time
from multiprocessing import Pool

import requests
from bs4 import BeautifulSoup

# One URL per value of the ``pn`` query parameter, 1 through 19
# (presumably the page number of the thread -- confirm against the site).
urls = ['http://tieba.baidu.com/p/3522395718?pn={}'.format(i) for i in range(1, 20)]

# Seconds to wait for the server before giving up on a request.  Without a
# timeout a stalled connection would block the run (or a worker) forever.
REQUEST_TIMEOUT = 10


def _post_date(post):
    """Return the date stored in a post's ``data-field`` JSON, or None.

    None is returned when the attribute is missing, is not valid JSON, or
    does not contain ``content.date``, so one odd post cannot abort a page.
    """
    try:
        return json.loads(post.get('data-field'))['content']['date']
    except (TypeError, ValueError, KeyError):
        # TypeError: attribute missing (None) or unexpected JSON shape;
        # ValueError: malformed JSON; KeyError: expected key absent.
        return None


def return_infos(url):
    """Fetch one page and print author, date and text of each post on it.

    Args:
        url: Address of the page to download.

    Returns:
        None.  Results are written to stdout; a page that cannot be
        downloaded is reported on stderr and skipped.
    """
    try:
        html = requests.get(url, timeout=REQUEST_TIMEOUT)
        html.raise_for_status()
    except requests.RequestException as exc:
        # Handle the failure here so that a single bad page does not abort
        # the whole run, and no exception has to cross the process boundary.
        print('failed to fetch {}: {}'.format(url, exc), file=sys.stderr)
        return

    soup = BeautifulSoup(html.content, 'lxml')
    items = soup.select('div.p_postlist div.l_post.j_l_post.l_post_bright')
    contents = soup.select('div.p_postlist div.d_post_content.j_d_post_content.clearfix')
    names = soup.select('div.p_postlist li.d_name a')

    # NOTE(review): the three lists are selected independently and paired by
    # position; zip() stops at the shortest one.  This assumes every post
    # yields exactly one match per selector -- confirm against the page HTML.
    for post, body, author in zip(items, contents, names):
        date = _post_date(post)
        content = body.text.strip()
        name = author.text.strip()
        print(name, date, content)


if __name__ == '__main__':
    # perf_counter() is the clock intended for measuring elapsed intervals;
    # unlike time.time() it is not affected by system clock adjustments.
    t0 = time.perf_counter()
    for ix in urls:
        return_infos(ix)
    t1 = time.perf_counter()

    # The with-block guarantees the workers are cleaned up even if map() raises.
    with Pool(4) as pool:
        pool.map(return_infos, urls)
        pool.close()  # close the pool: no further tasks are accepted
        pool.join()   # block the main process until the workers have exited
    t2 = time.perf_counter()
    print("正常执行的时间:", (t1 - t0))
    print("并行执行时间:", (t2 - t1))
爬取结果:
正常执行的时间: 16.037917375564575
并行执行时间: 6.655380487442017