Python 爬虫(2)多线程
前面说过由于GIL的存在,Python的多线程效率没有希望的那么高,python的多线程适合IO密集型的情况,而爬虫恰好就是一个IO密集的情况,因为爬虫中很大一部分时间,是在等待socket返回数据。
下面写一个例子:
import time

import requests

if __name__ == '__main__':
    codes = ['sh600993', 'sh000006', 'sh600658', 'sh600153', 'sh600005']
    start = time.time()
    # Sequential baseline: each request starts only after the previous one
    # has returned, so the waiting times add up.
    for code in codes:
        url = 'http://hq.sinajs.cn/list=' + code
        # A timeout keeps a stalled connection from hanging the script forever.
        response = requests.get(url, timeout=10).text
        print(response)
    # Elapsed wall-clock time in seconds for the whole run.
    print(time.time() - start)
1 2 3 4 5 6 7 8 9 10 11 | var hq_str_sh600993 = "马应龙,20.020,20.090,20.060,20.060,19.950,20.040,20.060,486809,9740634.000,2100,20.040,8300,20.030,1300,20.020,2300,20.010,4100,20.000,101,20.060,10000,20.070,14400,20.080,19000,20.090,25700,20.100,2017-01-24,11:30:00,00" ; var hq_str_sh000006 = "地产指数,6567.8364,6574.1060,6568.6375,6577.7249,6542.6599,0,0,1486830,1392918131,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2017-01-24,11:35:51,00" ; var hq_str_sh600658 = "电子城,13.320,13.200,13.270,13.320,13.040,13.270,13.280,559733,7389992.000,30800,13.270,300,13.220,6200,13.200,2500,13.100,4900,13.090,9300,13.280,6400,13.290,8200,13.300,6900,13.310,9000,13.320,2017-01-24,11:30:00,00" ; var hq_str_sh600153 = "建发股份,10.520,10.510,10.500,10.540,10.460,10.490,10.500,4834159,50730040.000,32800,10.490,60100,10.480,186000,10.470,181241,10.460,125800,10.450,56600,10.500,105500,10.510,108400,10.520,110400,10.530,139900,10.540,2017-01-24,11:30:00,00" ; var hq_str_sh600005 = "武钢股份,0.000,3.710,3.710,0.000,0.000,0.000,0.000,0,0.000,0,0.000,0,0.000,0,0.000,0,0.000,0,0.000,0,0.000,0,0.000,0,0.000,0,0.000,0,0.000,2017-01-24,11:30:00,03" ; 0.110999822617 |
换成多线程之后:
import threading
import time

import requests


def get_stock(code):
    """Fetch the quote for one stock code and print the raw response text.

    Args:
        code: Stock identifier appended to the quote URL, e.g. 'sh600993'.
    """
    url = 'http://hq.sinajs.cn/list=' + code
    # A timeout keeps one stalled request from blocking join() forever.
    response = requests.get(url, timeout=10).text
    print(response)


if __name__ == '__main__':
    codes = ['sh600993', 'sh000006', 'sh600658', 'sh600153', 'sh600005']
    start = time.time()
    # One thread per stock code; args must be a tuple, hence the trailing comma.
    threads = [threading.Thread(target=get_stock, args=(code,)) for code in codes]
    # Start every thread before joining any, so the requests overlap.
    for t in threads:
        t.start()
    for t in threads:
        t.join()
    # Elapsed wall-clock time in seconds once all threads have finished.
    print(time.time() - start)
1 2 3 4 5 6 7 8 9 10 11 | var hq_str_sh600993 = "马应龙,20.020,20.090,20.060,20.060,19.950,20.040,20.060,486809,9740634.000,2100,20.040,8300,20.030,1300,20.020,2300,20.010,4100,20.000,101,20.060,10000,20.070,14400,20.080,19000,20.090,25700,20.100,2017-01-24,11:30:00,00" ; var hq_str_sh600658 = "电子城,13.320,13.200,13.270,13.320,13.040,13.270,13.280,559733,7389992.000,30800,13.270,300,13.220,6200,13.200,2500,13.100,4900,13.090,9300,13.280,6400,13.290,8200,13.300,6900,13.310,9000,13.320,2017-01-24,11:30:00,00" ; var hq_str_sh000006 = "地产指数,6567.8364,6574.1060,6568.6375,6577.7249,6542.6599,0,0,1486830,1392918131,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2017-01-24,11:35:51,00" ; var hq_str_sh600153 = "建发股份,10.520,10.510,10.500,10.540,10.460,10.490,10.500,4834159,50730040.000,32800,10.490,60100,10.480,186000,10.470,181241,10.460,125800,10.450,56600,10.500,105500,10.510,108400,10.520,110400,10.530,139900,10.540,2017-01-24,11:30:00,00" ; var hq_str_sh600005 = "武钢股份,0.000,3.710,3.710,0.000,0.000,0.000,0.000,0,0.000,0,0.000,0,0.000,0,0.000,0,0.000,0,0.000,0,0.000,0,0.000,0,0.000,0,0.000,0,0.000,2017-01-24,11:30:00,03" ; 0.0379998683929 |
速度有了很大的提升
线程池
import time

import requests
import threadpool


def get_stock(code):
    """Fetch the quote for one stock code and print the raw response text.

    Args:
        code: Stock identifier appended to the quote URL, e.g. 'sh600993'.
    """
    url = 'http://hq.sinajs.cn/list=' + code
    # A timeout keeps one stalled request from blocking pool.wait() forever.
    response = requests.get(url, timeout=10).text
    print(response)


if __name__ == '__main__':
    codes = ['sh600993', 'sh000006', 'sh600658', 'sh600153', 'sh600005']
    start = time.time()
    # Pool with 5 worker threads.
    pool = threadpool.ThreadPool(5)
    # Build one work request per stock code, each calling get_stock(code).
    tasks = threadpool.makeRequests(get_stock, codes)
    # Queue every work request on the pool.
    [pool.putRequest(task) for task in tasks]
    # Block until all queued requests have been processed.
    pool.wait()
    # Elapsed wall-clock time in seconds for the whole run.
    print(time.time() - start)
threadpool.ThreadPool(5) 定义了一个线程池,参数 5 表示线程池中有 5 个工作线程;
makeRequests 根据要在线程中执行的函数及其参数列表创建任务请求,还可以指定回调函数;回调函数 callback 可以不写,默认为 None。
`[pool.putRequest(task) for task in tasks]` 是将所有多线程的请求扔进了线程池,等价于:
# Queue each work request built by makeRequests (not the raw stock codes).
for task in tasks:
    pool.putRequest(task)
pool.wait()是等待所有工作完成后退出。这里执行的数量还比较少,基本的多线程就够用了,当数量多了起来之后,线程池的效果会好一些。
【推荐】编程新体验,更懂你的AI,立即体验豆包MarsCode编程助手
【推荐】凌霞软件回馈社区,博客园 & 1Panel & Halo 联合会员上线
【推荐】抖音旗下AI助手豆包,你的智能百科全书,全免费不限次数
【推荐】轻量又高性能的 SSH 工具 IShell:AI 加持,快人一步