(18) Python Web Scraping in Practice
Let's start with the simplest possible case.
峰绘网: http://www.ifenghui.com/
A comic site that is quite easy to scrape. I picked it because the JPG links can be read straight out of the page source, and every JPG of a chapter is exposed on the page at once.
We'll use 黑水潭, the comic at the top of the site's popularity ranking.
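To see this for yourself, you can print the image links straight from a chapter's source. A minimal sketch (written in Python 3, unlike the Python 2 scripts below; it assumes only that BeautifulSoup is installed, and reuses the fh-read-img class and ssrc attribute that the scripts below rely on):

from urllib.request import urlopen
from bs4 import BeautifulSoup

# Fetch one chapter's reading page and list every embedded image URL
html = urlopen("http://www.ifenghui.com/index/comics/read/chapterId/19352.html").read()
soup = BeautifulSoup(html, "lxml")
for img in soup.find_all(class_="fh-read-img"):
    print(img.get("ssrc"))  # the real JPG link sits in the lazy-load 'ssrc' attribute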
Scraping a single chapter
昆虫学家, part 1 (28 pages)
# coding=utf-8
import os
import urllib
import urllib2
from bs4 import BeautifulSoup

# Fetch the chapter's reading page (Python 2: urllib2)
request = urllib2.Request("http://www.ifenghui.com/index/comics/read/chapterId/19352.html")
response = urllib2.urlopen(request)
html_ = response.read()
soup = BeautifulSoup(html_, "lxml")

i = 0
for a in soup.find_all(class_="fh-read-img"):  # every page image carries this class
    i = i + 1
    num = str(i)
    url = a.get('ssrc')  # the real JPG URL sits in the lazy-load 'ssrc' attribute
    if not os.path.exists("C:/manhua"):
        os.mkdir("C:/manhua")
    file_ = "C:/manhua/" + num + ".jpg"
    urllib.urlretrieve(url, file_)  # download and save under a sequential name
    print 'Page ' + num + ' downloaded OK'
print 'Download complete'
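The script above is Python 2 (urllib2 and the print statements do not exist in Python 3). For readers on Python 3, here is a rough equivalent sketch, assuming only the standard library's urllib.request plus BeautifulSoup:

import os
from urllib.request import urlopen, urlretrieve
from bs4 import BeautifulSoup

# Same logic as the script above, ported to Python 3
html = urlopen("http://www.ifenghui.com/index/comics/read/chapterId/19352.html").read()
soup = BeautifulSoup(html, "lxml")
os.makedirs("C:/manhua", exist_ok=True)  # create the target folder if missing
for i, a in enumerate(soup.find_all(class_="fh-read-img"), start=1):
    urlretrieve(a.get("ssrc"), "C:/manhua/%d.jpg" % i)  # save pages as 1.jpg, 2.jpg, ...
    print("Page %d downloaded OK" % i)
print("All done")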
Swap the URL for another chapter, 昆虫学家, part 2 (28 pages):
http://www.ifenghui.com/index/comics/read/chapterId/20560.html
and it works just as well.
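Since the two runs differ only in the URL, it is natural to wrap the logic in a function that takes the chapter URL as a parameter. A small sketch (Python 3 again; download_chapter and the output folders are my own names, not from the original scripts):

import os
from urllib.request import urlopen, urlretrieve
from bs4 import BeautifulSoup

def download_chapter(chapter_url, out_dir):
    # Download every page image of one chapter into out_dir
    soup = BeautifulSoup(urlopen(chapter_url).read(), "lxml")
    os.makedirs(out_dir, exist_ok=True)
    for i, a in enumerate(soup.find_all(class_="fh-read-img"), start=1):
        urlretrieve(a.get("ssrc"), os.path.join(out_dir, "%d.jpg" % i))

# Part 1 and part 2 of 昆虫学家 differ only in the chapterId
download_chapter("http://www.ifenghui.com/index/comics/read/chapterId/19352.html", "C:/manhua/1")
download_chapter("http://www.ifenghui.com/index/comics/read/chapterId/20560.html", "C:/manhua/2")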
Scraping the entire comic
# coding=utf-8
import os
import re
import urllib
import urllib2
from bs4 import BeautifulSoup

# Fetch the comic's table-of-contents page
request = urllib2.Request("http://www.ifenghui.com/index/comics/manhua/id/3235.html")
response = urllib2.urlopen(request)
html_ = response.read()
soup = BeautifulSoup(html_, "lxml")

# Take the page title and truncate it at the first space to get the comic's name
title_ = ''
tit = soup.title.string
for t in tit:
    if t == ' ':
        break
    else:
        title_ = title_ + t

# Every chapter link points at /index/comics/read/chapterId/...
findAll = soup.find_all('a', attrs={'href': re.compile('^/index/comics/read/chapterId')})
chapter = findAll[3:]      # the list of all chapters
chapter.reverse()          # reverse so chapters run from first to last
elementNum = len(chapter)  # number of chapters, for loop bookkeeping

i = 0
if not os.path.exists("D:/manhua"):
    os.mkdir("D:/manhua")  # create the download root on drive D

for eachChapter in chapter:
    i = i + 1
    chapterNum = str(i)  # which chapter we are downloading
    chapterTitle = eachChapter.string  # each chapter's title text
    rootUrl = 'http://www.ifenghui.com'  # site root
    chapterUrl = rootUrl + eachChapter.get('href')  # root + relative path = full chapter URL
    # print chapterTitle  # print each chapter's title
    # print chapterUrl    # print each chapter's URL
    request = urllib2.Request(chapterUrl)
    response = urllib2.urlopen(request)
    html_ = response.read()
    soup = BeautifulSoup(html_, "lxml")
    j = 0  # images are renamed by download order
    if not os.path.exists("D:/manhua/" + title_):  # (chapterTitle could be used here instead)
        os.mkdir("D:/manhua/" + title_)  # one folder per comic
    for a in soup.find_all(class_="fh-read-img"):
        j = j + 1
        pictureNum = str(j)  # which page we are downloading
        pictureUrl = a.get('ssrc')  # this image's real URL
        if not os.path.exists("D:/manhua/" + title_ + "/" + chapterNum):  # (chapterTitle could be used here instead)
            os.mkdir("D:/manhua/" + title_ + "/" + chapterNum)  # one folder per chapter
        file_ = "D:/manhua/" + title_ + "/" + chapterNum + "/" + pictureNum + ".jpg"
        urllib.urlretrieve(pictureUrl, file_)  # download to disk under the new name
        print 'Chapter ' + chapterNum + ', page ' + pictureNum + ' downloaded OK'
    print 'Chapter ' + chapterNum + ' downloaded'
print 'All chapters downloaded'
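One caveat: the script fires requests as fast as it can and gives up on the first network error. When pulling a whole comic, a short pause between downloads and a simple retry are worth adding. A hedged sketch of a drop-in replacement for urlretrieve (Python 3 names; the retry count and delay are my own choices, not from the original):

import time
from urllib.error import URLError
from urllib.request import urlretrieve

def polite_retrieve(url, path, retries=3, delay=1.0):
    # Fetch url into path, retrying on network errors and pausing between requests
    for attempt in range(retries):
        try:
            urlretrieve(url, path)
            time.sleep(delay)  # brief pause so we do not hammer the server
            return True
        except URLError:
            time.sleep(delay)  # back off, then try again
    return False  # give up after `retries` failed attempts

Inside the inner loop you would then call polite_retrieve(pictureUrl, file_) instead of urllib.urlretrieve(pictureUrl, file_).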