Python简易爬虫

#  coding: utf-8
import urllib
import urllib2
import re
import os

if __name__=='__main__':
    print "抓取开始..."
    j = 1
    for i in range(1,35):
        url='http://www.qiushibaike.com/8hr/page/'+str(i)+'/?s=4981088'
        header={'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.81 Safari/537.36'}
        try:
            request=urllib2.Request(url=url,headers=header)
            response=urllib2.urlopen(request)
            content= response.read()
        except urllib2.HTTPError as e:
            print e
            exit()
        except urllib2.URLError as e:
            print e
            exit()
        pattern=re.compile('<div class="content">.*?<span>(.*?)</span>.*?</div>',re.S)
        items=re.findall(pattern,content)
        path="qiubai"
        if not os.path.exists(path):
            os.makedirs(path)
        for item in items:
            file_path=path+"/"+str(j)+'.txt'
            f=open(file_path,'w')
            item=item.replace('<br/>','\n')
            f.write(item)
            f.close()
            j=j+1
    print "内容抓取完成..."

 

重构后

#  coding: utf-8
import urllib
import urllib2
import re
import os

class Spider(object):
    #构造方法
    def __init__(self):
        self.url='http://www.qiushibaike.com/8hr/page/%s/?s=4981088'
        self.user_agent='Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.81 Safari/537.36'

    #获取网页源代码
    def get_page(self,page_index):
        header={'User-Agent':self.user_agent}
        try:
            request=urllib2.Request(url=self.url%str(page_index),headers=header)
            response=urllib2.urlopen(request)
            content= response.read()
            return content
        except urllib2.HTTPError as e:
            print e
            exit()
        except urllib2.URLError as e:
            print e
            exit()

    #分析网页源代码
    def analysis(self,content):
        pattern = re.compile('<div class="content">.*?<span>(.*?)</span>.*?</div>', re.S)
        items = re.findall(pattern, content)
        return items

        #保存网页源代码
    def save(self,items,path,page_index):
        path = "qiubai"
        strPage=''
        if not os.path.exists(path):
            os.makedirs(path)
        if page_index<10:
            strPage='0'+str(page_index)
        else:
            strPage=str(page_index)
        j = 1
        strJ=''
        for item in items:
            if j<10:
                strJ='0'+str(j)
            else:
                strJ=str(j)
            file_path = path + "/" + strPage+strJ + '.txt'
            f = open(file_path, 'w')
            item = item.replace('<br/>', '\n')
            f.write(item)
            f.close()
            j = j + 1

    #运行
    def run(self):
        print '开始抓取内容了...'
        for i in range(1,35):
            content=self.get_page(i)
            items=self.analysis(content)
            self.save(items,'qiubai',i)
        print '内容抓取完了...'


if __name__=='__main__':
    # Script entry point: build the crawler and let it do the whole job.
    Spider().run()

 

posted @ 2017-05-09 20:36  框框A  阅读(630)  评论(0编辑  收藏  举报