Python3 Web Crawler
Recently I studied scraping meizitu images on Shiyanlou and found that the script did not run very smoothly: some images would fail to download. The version below adds timeouts and bounded retries, split across a main script and two helper modules.
# coding: utf-8
import re
import threading
from urllib.request import urlopen
from urllib.error import HTTPError
from bs4 import BeautifulSoup
import meizi_series_nextpage

# Load a page with a timeout; return '' on any error so callers can retry.
def loadurl(url):
    try:
        conn = urlopen(url, timeout=5)
        html = conn.read()
        return html
    except HTTPError as e:
        print(e)
        return ''
    except Exception as e:
        print("unknown exception in conn.read(): %s" % e)
        return ''

def meizi(url, path):
    # Grab the tag links from the front page.
    print('start open meiziwang')
    html = ''
    while True:
        html = loadurl(url)
        if html == '':
            print('load', url, 'error')
            continue
        else:
            break
    mnvtp = BeautifulSoup(html, 'html.parser')
    taglists = mnvtp.findAll("div", {"class": "tags"})
    taglists = list(set(re.findall('<a.*?href="(.*?)".*?>', '%s' % taglists)))
    print(taglists)
    print(len(taglists))
    print('open meiziwang over')
    # Crawl every tag in its own thread.
    threads = []
    for tagurl in taglists:
        t = threading.Thread(target=meizi_series_nextpage.nextpage, args=(tagurl, path))
        threads.append(t)
    for t in threads:
        t.start()
    for t in threads:
        t.join()

if __name__ == '__main__':
    meizi('http://www.meizitu.com', 'D:\\MeiZi\\')
    print('Spider Stop')
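One caveat about the while True retry loop above: if a page is permanently unreachable, the crawler spins forever and hammers the site. A minimal sketch of a bounded alternative, reusing the loadurl() above (the retry count and delay are my own assumptions, not from the original):

import time

def loadurl_with_retry(url, retries=3, delay=2):
    # Try at most `retries` times, pausing `delay` seconds between attempts.
    for attempt in range(retries):
        html = loadurl(url)
        if html != '':
            return html
        print('retry %d/%d for %s' % (attempt + 1, retries, url))
        time.sleep(delay)
    return ''  # caller decides how to handle a permanent failure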
# coding: utf-8
# meizi_series_nextpage.py
import re
from urllib.request import urlopen
from urllib.error import HTTPError
from bs4 import BeautifulSoup
import meizi_series_getpage

# Same as in the main script: load a page with a timeout so a dead link
# cannot hang the crawler; return '' on any error.
def loadurl(url):
    try:
        conn = urlopen(url, timeout=5)
        html = conn.read()
        return html
    except HTTPError as e:
        print(e)
        return ''
    except Exception as e:
        print(e)
        return ''

def nextpage(url, path):
    # Tail of the tag URL, e.g. 'hot.html'.
    nextweibu = re.split("/", url)
    # Head of the URL, up to and including '/a/'.
    nexthead = re.split("/a/", url)
    nexthead = nexthead[0] + "/a/"
    # Each tag gets its own subdirectory, named after the URL tail.
    path = path + "\\" + nextweibu[-1].split(".", 1)[0]
    # Fetch the tag page, retrying until it loads.
    while True:
        html = loadurl(url)
        if html == '':
            print('load', url, 'error')
            continue
        else:
            break
    # Collect the pagination links of this tag.
    mnvtp = BeautifulSoup(html, 'html.parser')
    taglists = mnvtp.findAll("div", {"id": "wp_page_numbers"})
    taglists = re.findall('<a.*?href="(.*?)".*?>', '%s' % taglists)
    taglists = sorted(list(set(taglists)))
    if taglists == []:
        # A single-page tag has no pagination block.
        taglists = [nextweibu[-1]]
    # Build the full URL of every page under this tag.
    print("collecting page urls for tag: %s" % url)
    completeurl = []
    for i in taglists:
        completeurl.append(nexthead + i)
    completeurl = sorted(completeurl)
    for i in completeurl:
        print("collecting series urls on page: %s" % i)
        meizi_series_getpage.tag_series(i, path)
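To make the string handling in nextpage() concrete, here is what those splits produce for a hypothetical tag URL (the URL itself is made up for illustration):

import re

url = 'http://www.meizitu.com/a/cute.html'
nextweibu = re.split("/", url)              # ['http:', '', 'www.meizitu.com', 'a', 'cute.html']
nexthead = re.split("/a/", url)[0] + "/a/"  # 'http://www.meizitu.com/a/'
subdir = nextweibu[-1].split(".", 1)[0]     # 'cute' -> images land in <path>\cute
print(nexthead, subdir)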
# coding: utf-8
# meizi_series_getpage.py -- series and image download logic
import os
import re
from urllib.request import urlopen, Request
from urllib.error import HTTPError

# Main download logic: extract the image links from a series page,
# then hand them to pic_list().
def picurl(url, path):
    if os.path.exists(path):
        print(path, 'directory already exists')
    else:
        print("creating directory: %s" % path)
        os.makedirs(path)
    # Fetch the series page, retrying until it loads.
    html = ''
    while True:
        html = loadurl(url)
        if html == '':
            continue
        else:
            break
    rePicContent1 = '<div.*?id="picture.*?>.*?<p>(.*?)</p>'
    rePicContent2 = '<div.*?class="postContent.*?>.*?<p>(.*?)</p>'
    rePicList = '<img.*?src="(.*?)".*?>'
    # A note on re.S: it is optional, but with it '.' also matches newline
    # characters, so a pattern can match across line breaks. Line breaks are
    # very common in HTML, which is why I use re.S throughout.
    picContent = re.findall(rePicContent1, "%s" % html, re.S)
    if len(picContent) <= 0:
        picContent = re.findall(rePicContent2, "%s" % html, re.S)
    if len(picContent) <= 0:
        print('no image urls matched on this page')
        return False
    else:
        picList = re.findall(rePicList, "%s" % picContent[0], re.S)
        pic_list(picList, path)

# This function is just a middleman: the for loop pulled out of picurl().
def pic_list(picList, path):
    for picurl in picList:
        print("got image url: %s" % picurl)
        save_pic(picurl, path)

# Save one image to disk, retrying a bounded number of times.
def save_pic(url, path):
    name = re.findall(r'.*/(.*?\.jpg)', url)
    filename = path + '\\' + name[0]
    print(filename + ': start')  # console status line
    # Number of retries when a download fails.
    tryTimes = 3
    ok = False
    # While retries remain, attempt the download.
    while tryTimes != 0:
        tryTimes -= 1
        if os.path.exists(filename):
            print(filename, 'already exists, skipping')
            return True
        if download(url, filename):
            ok = True
            break
    if ok:
        print(filename + ": over")
    else:
        print(url + " : failed to download")  # console status line

# The callee that actually saves the image. timeout=5 sets the timeout:
# five seconds is generous for an image under 500 KB; on timeout we
# report failure so save_pic() can retry.
def download(url, filename):
    try:
        headers = {
            'Host': 'mm.howkuai.com',  # the image host, hard-coded
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:49.0) Gecko/20100101 Firefox/49.0',
        }
        req = urlopen(Request(url, headers=headers), timeout=5).read()
        with open(filename, 'wb') as f:
            f.write(req)
        return True
    except HTTPError as e:
        print(e)
        return False
    except Exception as e:
        print(e)
        return False

def loadurl(url):
    try:
        conn = urlopen(url, timeout=5)
        html = conn.read()
        return html
    except HTTPError:
        return ''
    except Exception:
        print("unknown exception in conn.read()")
        return ''
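One piece is missing from the listing above: nextpage() calls meizi_series_getpage.tag_series(), but that function never appears. A minimal sketch of what it has to do -- scrape the series links off one tag page and hand each to picurl() -- where the regex and page structure are my guesses, not the original code:

def tag_series(url, path):
    # Fetch one tag page, retrying on failure like the other loaders.
    html = ''
    while True:
        html = loadurl(url)
        if html != '':
            break
    # Hypothetical extraction: pull every series link out of the page body.
    serieslist = re.findall(r'<a.*?href="(.*?\.html)".*?>', '%s' % html)
    for series in sorted(set(serieslist)):
        # Each series page gets its own picurl() run into this tag's directory.
        picurl(series, path)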
I will come back and explain the code in more detail when I have time. The first block is the main script; just save the other blocks as the .py modules it imports.
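For reference, this is the file layout the imports imply (the module names come straight from the import statements; the main script's file name is my own choice):

D:\crawler\
    meizi_main.py             -- the first block (the main script)
    meizi_series_nextpage.py  -- the second block (per-tag pagination)
    meizi_series_getpage.py   -- the third block (series and image downloads)

Run meizi_main.py and the images are saved under D:\MeiZi\, one subdirectory per tag.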