Python简易爬虫
# coding: utf-8
# Simple scraper: downloads listing pages 1..34 from qiushibaike.com, extracts
# the text of every '<div class="content">' entry and stores each entry in its
# own numbered file under ./qiubai. Runs on both Python 2 and Python 3.
import os
import re
import sys
import urllib  # unused here; kept from the original import list
from contextlib import closing

try:  # Python 2
    from urllib2 import Request, URLError, urlopen
except ImportError:  # Python 3 split urllib2 into urllib.request / urllib.error
    from urllib.error import URLError
    from urllib.request import Request, urlopen


def main():
    """Fetch every listing page and write one text file per extracted entry.

    Files are named ``qiubai/<n>.txt`` where ``n`` counts entries across all
    pages, starting at 1. On a network/HTTP error the error is reported on
    stderr and the process exits with status 1.
    """
    print("抓取开始...")

    # Loop-invariant values are built once instead of once per page.
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.81 Safari/537.36'}
    # Bytes pattern: response.read() yields bytes on Python 3 (and str, which
    # is the same type as bytes, on Python 2), so no decoding is needed.
    pattern = re.compile(b'<div class="content">.*?<span>(.*?)</span>.*?</div>', re.S)
    out_dir = "qiubai"
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)

    counter = 1  # running file number shared by all pages
    for page in range(1, 35):
        url = 'http://www.qiushibaike.com/8hr/page/' + str(page) + '/?s=4981088'
        request = Request(url=url, headers=headers)
        try:
            # closing() releases the connection even if read() fails; the
            # Python 2 response object is not itself a context manager.
            with closing(urlopen(request)) as response:
                content = response.read()
        except URLError as e:
            # HTTPError is a subclass of URLError, so one handler covers both.
            sys.stderr.write('%s\n' % e)
            sys.exit(1)

        for item in pattern.findall(content):
            file_path = os.path.join(out_dir, str(counter) + '.txt')
            # Binary mode writes the scraped bytes unchanged on every platform;
            # 'with' guarantees the handle is closed if write() raises.
            with open(file_path, 'wb') as f:
                f.write(item.replace(b'<br/>', b'\n'))
            counter += 1

    print("内容抓取完成...")


if __name__ == '__main__':
    main()
重构后
# coding: utf-8
# Refactored scraper: the Spider class downloads listing pages from
# qiushibaike.com, extracts the text entries and saves each one to its own
# file. Runs on both Python 2 and Python 3.
import os
import re
import sys
import urllib  # unused here; kept from the original import list
from contextlib import closing

try:  # Python 2
    from urllib2 import Request, URLError, urlopen
except ImportError:  # Python 3 split urllib2 into urllib.request / urllib.error
    from urllib.error import URLError
    from urllib.request import Request, urlopen

# Compiled once at import time. It is a bytes pattern because the page source
# is handled as raw bytes from download to disk (no decoding step).
_ITEM_PATTERN = re.compile(
    b'<div class="content">.*?<span>(.*?)</span>.*?</div>', re.S)


class Spider(object):
    """Scraper for the qiushibaike.com "8hr" listing pages."""

    def __init__(self):
        """Set the page URL template and the User-Agent sent with requests."""
        # '%s' is replaced by the page number in get_page().
        self.url = 'http://www.qiushibaike.com/8hr/page/%s/?s=4981088'
        self.user_agent = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.81 Safari/537.36'

    def get_page(self, page_index):
        """Download one listing page and return its raw source as bytes.

        On a network/HTTP error the error is written to stderr and the
        process is terminated with exit status 1 (SystemExit is raised).
        """
        header = {'User-Agent': self.user_agent}
        request = Request(url=self.url % str(page_index), headers=header)
        try:
            # closing() releases the connection even if read() fails; the
            # Python 2 response object is not itself a context manager.
            with closing(urlopen(request)) as response:
                return response.read()
        except URLError as e:
            # HTTPError is a subclass of URLError, so one handler covers both.
            sys.stderr.write('%s\n' % e)
            sys.exit(1)

    def analysis(self, content):
        """Return the inner text of every content entry found in *content*.

        *content* must be bytes (what get_page() returns); the result is a
        list of bytes objects, possibly empty.
        """
        return _ITEM_PATTERN.findall(content)

    def save(self, items, path, page_index):
        """Write each item to ``<path>/<PP><NN>.txt``.

        ``PP`` is *page_index* and ``NN`` the 1-based position of the item on
        the page, both zero-padded to at least two digits. *path* is created
        if it does not exist. '<br/>' tags inside an item become newlines.
        """
        # The caller-supplied *path* is honoured; it used to be overwritten
        # with a hard-coded "qiubai", which made the parameter meaningless.
        if not os.path.exists(path):
            os.makedirs(path)
        page_tag = str(page_index).zfill(2)
        for position, item in enumerate(items, 1):
            if not isinstance(item, bytes):
                # Text items are accepted too and stored as UTF-8.
                item = item.encode('utf-8')
            file_name = page_tag + str(position).zfill(2) + '.txt'
            # Binary mode writes the bytes unchanged on every platform;
            # 'with' guarantees the handle is closed if write() raises.
            with open(os.path.join(path, file_name), 'wb') as f:
                f.write(item.replace(b'<br/>', b'\n'))

    def run(self):
        """Scrape pages 1..34 and store their entries under ./qiubai."""
        print('开始抓取内容了...')
        for i in range(1, 35):
            content = self.get_page(i)
            items = self.analysis(content)
            self.save(items, 'qiubai', i)
        print('内容抓取完了...')


if __name__ == '__main__':
    spider = Spider()
    spider.run()