一个简单的 Python 网络爬虫(抓图),针对某论坛。
# coding: utf-8
"""Simple forum image crawler (Python 2).

Walks the index pages of one forum board, opens every thread linked from each
index page, and saves the images attached to that thread into the current
working directory.
"""
import urllib2
import re
import threading

BASE_URL = 'http://www.xxx.com/'

# Sent with every HTML page request. The value is kept exactly as the script
# has always sent it.
HEADERS = {"User-Agent": "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1"}

# Attachment <img> tag on a thread page; group 1 is the relative image path.
# NOTE(review): every ".*" is greedy, so a line holding several <img> tags
# yields a single match -- confirm against the real markup.
IMG_LINK_RE = re.compile(
    r'<img src=".*" file="(.*)" width=".*" id=".*" alt=".*.jpg" />')

# Thread link on a board index page; group 1 is the relative URL, group 2 is
# the link text (used as the thread title).
ARTICLE_LINK_RE = re.compile(
    r'<a href="(viewthread\.php\?tid=.*3D.*)">(.*)</a>')


def _fetch(url):
    """Return the raw body of *url*, requested with the shared HEADERS."""
    response = urllib2.urlopen(urllib2.Request(url, headers=HEADERS))
    try:
        return response.read()
    finally:
        response.close()


def loadImg(addr, x, y, artName):
    """Download one image and write it to ``<artName><y>.jpg``.

    Args:
        addr: Absolute URL of the image.
        x: Index of the thread on its board page. Not used here; kept so the
            signature stays compatible with existing callers.
        y: Index of the image inside the thread; becomes part of the file name.
        artName: Thread title as a UTF-8 encoded byte string; used as the file
            name prefix.

    Raises:
        urllib2.URLError: If the image cannot be fetched.
        IOError: If the output file cannot be written.
    """
    # The image request intentionally carries no custom headers, as before.
    response = urllib2.urlopen(addr)
    try:
        data = response.read()
    finally:
        response.close()
    # "with" guarantees the handle is closed even when write() fails.
    with open(artName.decode("utf-8") + str(y) + '.jpg', 'wb') as f:
        f.write(data)


def getImgLink(html, x, artName):
    """Find the attached images in a thread page and download them in parallel.

    Args:
        html: HTML source of the thread page.
        x: Index of the thread on its board page (only used in the log line).
        artName: Thread title, forwarded to loadImg as the file name prefix.

    One worker thread is started per image. All workers are joined before
    returning, so the files of this thread are complete once the call ends.
    An exception inside a worker terminates only that worker.
    """
    workers = []
    for y, lin in enumerate(IMG_LINK_RE.findall(html)):
        imgAddr = BASE_URL + lin
        # Single parenthesised expression: prints the same text as before.
        print("LoadImg:" + str(x) + " " + imgAddr + '\n')
        # Pass the callable and its arguments separately. Writing
        # target=loadImg(...) would run the download right here and hand
        # Thread a target of None.
        t = threading.Thread(target=loadImg, args=(imgAddr, x, y, artName))
        t.start()
        workers.append(t)
    for t in workers:
        t.join()


def getArticleLink(html, page):
    """Parse a board index page and process every thread it links to.

    Args:
        html: HTML source of the board index page.
        page: Zero-based offset of the index page within the crawl. Not used
            here; kept so the signature stays compatible.

    Raises:
        urllib2.URLError: If a thread page cannot be fetched.
    """
    # Threads are numbered from 1 in page order.
    for x, lin in enumerate(ARTICLE_LINK_RE.findall(html), 1):
        getImgLink(_fetch(BASE_URL + lin[0]), x, lin[1])


def main(start=1, end=100):
    """Crawl board index pages *start* through *end* (both inclusive)."""
    for offset, page_no in enumerate(range(start, end + 1)):
        url = BASE_URL + "forumdisplay.php?fid=19&page=" + str(page_no)
        html = _fetch(url)
        print('Start')
        getArticleLink(html, offset)


# Run the crawl only when executed as a script, not on import.
if __name__ == "__main__":
    main()