单线程、多线程、多进程、协程比较,以爬取新浪军事历史为例
演示python单线程、多线程、多进程、协程
1 import requests,json,random 2 import re,threading,time 3 from lxml import etree 4 5 lock=threading.Lock() 6 semaphore=threading.Semaphore(100) ###每次限制只能100线程 7 8 user_agent_list = [ \ 9 "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1" ,\ 10 "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11", \ 11 "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6", \ 12 "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6", \ 13 "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1", \ 14 "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5", \ 15 "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5", \ 16 "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3", \ 17 "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3", \ 18 "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3", \ 19 "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3", \ 20 "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3", \ 21 "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3", \ 22 "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3", \ 23 "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3", \ 24 "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3", \ 25 "Mozilla/5.0 (X11; 
Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24", \ 26 "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24" 27 ] 28 count=0 29 30 def sina(page_url): ##列表页 31 if semaphore.acquire(): 32 header={} 33 34 header['User-Agent']=random.choice(user_agent_list) 35 header.update({ 36 "Host":"platform.sina.com.cn", 37 38 #"Cookie":"global_cookie=fb1g6d0w64d2cmu86sv4g9n3va0j137sk48; vh_newhouse=3_1491312022_2816%5B%3A%7C%40%7C%3A%5D833300ee3177d88529c7aa418942ece9; newhouse_user_guid=2F163DE7-8201-7FA9-2FB6-E507FE6F03B1; SoufunSessionID_Esf=3_1495389730_232; sf_source=; s=; showAdsh=1; hlist_xfadhq_SZ=0%7c2017%2f5%2f25+1%3a21%3a47%7c; city=sz; __utmt_t0=1; __utmt_t1=1; __utmt_t2=1; logGuid=a768dd46-b85b-47f4-a7a0-0a6596cab4cd; __utma=147393320.1111837171.1491290389.1495646208.1495650134.9; __utmb=147393320.12.10.1495650134; __utmc=147393320; __utmz=147393320.1495650134.9.4.utmcsr=esf.sz.fang.com|utmccn=(referral)|utmcmd=referral|utmcct=/; unique_cookie=U_cqyov4ut5vv1al8e2858qhzgt17j2z06mph*14" 39 }) 40 while(1): 41 content='' 42 try: 43 content=requests.get(page_url,headers=header,timeout=5).content 44 45 except Exception as e: 46 print e 47 if content!='': 48 break 49 50 51 52 53 jsona=re.findall('jQuery191012358189839869738_1495880348059\(([\s\S]*?"}]}})',content)[0] 54 #print jsona 55 dict= json.loads(jsona) 56 #print type(dict) 57 #print dict 58 #print dict['result']['data'] 59 for l in dict['result']['data']: 60 title= l['title'] 61 url= l['url'] 62 biaoqian=get_biaoqian(url) 63 64 lock.acquire() 65 global count 66 count+=1 67 print time.strftime('%H:%M:%S',time.localtime(time.time())),' ',count 68 print '列表页:' 69 70 print ' title: %s\n url: %s'%(title,url) 71 72 print '详情页:' 73 print ' biaoqian: %s \n'%(biaoqian) 74 print '**************************************************************' 75 lock.release() 76 77 semaphore.release() 78 79 80 81 def get_biaoqian(url): ###新闻页,爬取标签 
82 83 header={'User-Agent':random.choice(user_agent_list)} 84 header.update({"Host":"mil.news.sina.com.cn"}) 85 86 while(1): 87 content='' 88 try: 89 content=requests.get(url,headers=header,timeout=10).content 90 except Exception as e: 91 #print e 92 pass 93 if content!='': 94 break 95 96 97 se=etree.HTML(content) 98 #print etree.tounicode(se) 99 biaoqian=se.xpath('//p[@class="art_keywords"]/a/text()') 100 return ' '.join(biaoqian) 101 102 103 104 105 def singe_req(): 106 for i in range(1,301): 107 page_url='http://platform.sina.com.cn/news/news_list?app_key=2872801998&channel=mil&cat_1=lishi&show_all=0&show_cat=1&show_ext=1&tag=1&format=json&page=%s&show_num=10&callback=jQuery191012358189839869738_1495880348059&_=1495880348069'%i 108 sina(page_url) 109 print 'over' 110 111 def threading_red(): 112 threads=[] 113 for i in range(1,301): 114 t=threading.Thread(target=sina,args=('http://platform.sina.com.cn/news/news_list?app_key=2872801998&channel=mil&cat_1=lishi&show_all=0&show_cat=1&show_ext=1&tag=1&format=json&page=%s&show_num=10&callback=jQuery191012358189839869738_1495880348059&_=1495880348069'%i,)) 115 threads.append(t) 116 t.start() 117 for t in threads: 118 t.join() 119 print 'over' 120 121 def muiltiprocessing_req(): 122 import multiprocessing 123 pool = multiprocessing.Pool(100) 124 #pool = multiprocessing.Pool(multiprocessing.cpu_count()) 125 126 pool.map(sina, ['http://platform.sina.com.cn/news/news_list?app_key=2872801998&channel=mil&cat_1=lishi&show_all=0&show_cat=1&show_ext=1&tag=1&format=json&page=%s&show_num=10&callback=jQuery191012358189839869738_1495880348059&_=1495880348069'%i for i in range(1,301)]) 127 pool.close() 128 pool.join() 129 print 'over' 130 131 def gevent_req(): 132 ######################利用pool###################### 133 from gevent import monkey 134 from gevent.pool import Pool 135 136 monkey.patch_all() 137 pool = Pool(100) 138 data= pool.map(sina, 
['http://platform.sina.com.cn/news/news_list?app_key=2872801998&channel=mil&cat_1=lishi&show_all=0&show_cat=1&show_ext=1&tag=1&format=json&page=%s&show_num=10&callback=jQuery191012358189839869738_1495880348059&_=1495880348069'%i for i in range(1,301)]) 139 print 'over' 140 141 if __name__=='__main__': 142 pass 143 singe_req() ##单线程 144 #threading_red() ###多线程 145 #muiltiprocessing_req() ####多进程
146 #gevent_req() ##协程
这篇主要是用四种方法来实现爬虫。无论是100线程还是100进程或者100协程,网速都撑满了,爬取速度很快,单线程对网速利用很不充分,当然就爬取缓慢。
特别是我之前在面试房极客时候,那主管告诉我,他说他看了网上说python多线程是假的,所以他从来就没使用过多线程,只用多进程,他认为多线程不能加快爬虫速度。
关于这一点我是非常确定python多线程能加快爬取速度的,因为我使用多线程的时间很长,那主管应该只看了一半:python多线程对cpu密集型任务的速度提升不了多少,但对于io密集型任务的速度提升是立竿见影的,特别是对timeout比较大的网站,多线程爬取优势非常明显。因为爬虫是打开页面,请求服务器后端,服务器后端操作数据库查询数据,数据库返回给后端,后端再返回给前端,这种属于io密集型,多线程在爬虫和性能测试中都是可以的。而多进程实在是开销太大了,开100进程,任务管理器可以看到100个python.exe,每个占用20M内存,多进程启动时候占用cpu极高。爬虫是非常适合多线程的,或者利用协程也可以。
发下运行结果:
反对极端面向过程编程思维方式,喜欢面向对象和设计模式的解读,喜欢对比极端面向过程编程和oop编程消耗代码行数的区别和原因。致力于使用oop和36种设计模式写出最高可复用的框架级代码和使用最少的代码行数完成任务,致力于使用oop和设计模式来使部分代码减少90%行,使绝大部分py文件最低减少50%-80%行的写法。
【推荐】国内首个AI IDE,深度理解中文开发场景,立即下载体验Trae
【推荐】编程新体验,更懂你的AI,立即体验豆包MarsCode编程助手
【推荐】抖音旗下AI助手豆包,你的智能百科全书,全免费不限次数
【推荐】轻量又高性能的 SSH 工具 IShell:AI 加持,快人一步
· 从 HTTP 原因短语缺失研究 HTTP/2 和 HTTP/3 的设计差异
· AI与.NET技术实操系列:向量存储与相似性搜索在 .NET 中的实现
· 基于Microsoft.Extensions.AI核心库实现RAG应用
· Linux系列:如何用heaptrack跟踪.NET程序的非托管内存泄露
· 开发者必知的日志记录最佳实践
· winform 绘制太阳,地球,月球 运作规律
· AI与.NET技术实操系列(五):向量存储与相似性搜索在 .NET 中的实现
· 超详细:普通电脑也行Windows部署deepseek R1训练数据并当服务器共享给他人
· 上周热点回顾(3.3-3.9)
· AI 智能体引爆开源社区「GitHub 热点速览」