使用多进程请求多个url来减少网络等待浪费的时间
code
"""Fetch several URLs with a process pool so network waits overlap.

Worker processes download pages (``get_page``); the main process handles
each finished download via the ``apply_async`` callback (``parse_page``),
which appends a one-line summary to ``db.txt``.
"""
from multiprocessing import Pool

import requests
import json  # NOTE(review): unused in this script; kept in case other code relies on it
import os


def get_page(url):
    """Download *url* in a worker process.

    Returns ``{'url': ..., 'text': ...}`` on HTTP 200, or ``None`` for any
    other status — callers (the callback) must handle ``None``.
    """
    print('<进程%s> get %s' % (os.getpid(), url))
    # timeout so a hung server cannot stall a pool worker forever
    respone = requests.get(url, timeout=30)
    if respone.status_code == 200:
        return {'url': url, 'text': respone.text}
    return None  # explicit: non-200 responses yield no result


def pasrse_page(res):
    """Callback, runs in the main process: log and record the page size.

    (Name kept as-is — spelled ``pasrse_page`` — to preserve the script's
    public surface; it is referenced below by name.)
    """
    if res is None:
        # get_page got a non-200 response. Without this guard the original
        # code raised TypeError on res['url'] inside the callback.
        return
    print('<进程%s> parse %s' % (os.getpid(), res['url']))
    parse_res = 'url:<%s> size:[%s]\n' % (res['url'], len(res['text']))
    # explicit encoding: db.txt mixes ASCII and whatever the pages contain
    with open('db.txt', 'a', encoding='utf-8') as f:
        f.write(parse_res)


if __name__ == '__main__':
    urls = [
        'https://www.baidu.com',
        'https://www.python.org',
        'https://www.openstack.org',
        'https://help.github.com/',
        'http://www.sina.com.cn/',
    ]
    p = Pool(3)
    res_l = []
    for url in urls:
        # the callback runs in the parent process as each download completes
        res = p.apply_async(get_page, args=(url,), callback=pasrse_page)
        res_l.append(res)
    p.close()
    p.join()
    # print([res.get() for res in res_l])
    # The get_page results were already consumed by the callback, so there
    # is no need to collect them here.
Outputs
macname@MacdeMacBook-Pro py % python3 cccccc.py <进程61068> get https://www.baidu.com <进程61069> get https://www.python.org <进程61070> get https://www.openstack.org <进程61068> get https://help.github.com/ <进程61067> parse https://www.baidu.com <进程61069> get http://www.sina.com.cn/ <进程61067> parse https://www.python.org <进程61067> parse http://www.sina.com.cn/ <进程61067> parse https://help.github.com/ <进程61067> parse https://www.openstack.org macname@MacdeMacBook-Pro py %