Python crawler 004: scraping the front page of Qiushibaike's "fresh posts" section with urllib2 and regular expressions

The procedural approach

#!/usr/bin/env python 
# -*- coding: utf-8 -*-

import urllib2
import sys
import re
import os

# Local filesystem encoding, used to re-encode the fetched page before writing it to disk
sys_encoding = sys.getfilesystemencoding()
if __name__ == '__main__':
    # 1. Fetch one page and get its HTML source
    url = 'http://www.qiushibaike.com/textnew/'
    user_agent = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.81 Safari/537.36'
    headers = {'User-Agent': user_agent}
    try:
        req = urllib2.Request(url=url, headers=headers)
        res = urllib2.urlopen(req)
        html = res.read().decode("UTF-8").encode(sys_encoding)
    except urllib2.HTTPError as e:  # HTTP status errors (4xx/5xx); a subclass of URLError, so it must be caught first
        print e
        exit()
    except urllib2.URLError as e:  # network-level failures (DNS lookup, refused connection, ...)
        print e
        exit()
    # 2. Extract the target data from the HTML: post id and post content
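    # Group 1 captures the quoted value of the id attribute, the non-capturing
    # (?:.*?) skips the markup between the two divs, and group 2 captures the
    # raw body inside <div class="content">; re.S lets .*? match across newlines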
    regex_content = re.compile(
        '<div class="article block untagged mb15" id=(.*?)>(?:.*?)<div class="content">(.*?)</div>',
        re.S)
    items = re.findall(regex_content, html)
    for item in items:
        file_name = item[0].strip('\'')
        # strip the <span> wrapper, drop source newlines, and turn <br/> tags into real line breaks
        content = item[1].strip().replace('<span>', '').replace('</span>', '').replace(
            '\n', '').replace('<br/>', '\n')
        # 3. Save the extracted data to a file
        path = 'qiubai'
        if not os.path.exists(path):
            os.makedirs(path)
        file_path = path + '/' + file_name + '.txt'
        with open(file_path, 'w') as fp:
            fp.write(content)  # the with block closes the file automatically
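To see what steps 2 and 3 actually do, here is a minimal standalone sketch that runs the same pattern and cleanup against a hand-written fragment. The fragment only mimics the 2017-era Qiushibaike markup and is invented for illustration, not fetched from the site:

#!/usr/bin/env python
# -*- coding: utf-8 -*-
import re

# Invented fragment mimicking one post on the live page
sample = ('<div class="article block untagged mb15" id=\'qiushi_tag_119090585\'>'
          '<a href="/users/123/">author</a>'
          '<div class="content"><span>line one<br/>line two</span></div>')

regex_content = re.compile(
    '<div class="article block untagged mb15" id=(.*?)>(?:.*?)<div class="content">(.*?)</div>',
    re.S)
for post_id, body in re.findall(regex_content, sample):
    file_name = post_id.strip('\'')
    content = body.strip().replace('<span>', '').replace('</span>', '').replace(
        '\n', '').replace('<br/>', '\n')
    print file_name  # -> qiushi_tag_119090585
    print content    # -> 'line one' and 'line two' on separate lines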

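One caveat in step 3: the page bytes are re-encoded to the local filesystem encoding before writing, which can raise UnicodeEncodeError when that encoding (e.g. GBK on a Chinese Windows locale) cannot represent every character in a post. A variant sketch that keeps the text as unicode and always writes UTF-8 via io.open; the helper name save_post is mine, not from the original code:

#!/usr/bin/env python
# -*- coding: utf-8 -*-
import io
import os

def save_post(path, file_name, content):
    # content is assumed to be a unicode string; io.open encodes it as UTF-8 on write
    if not os.path.exists(path):
        os.makedirs(path)
    file_path = os.path.join(path, file_name + '.txt')
    with io.open(file_path, 'w', encoding='utf-8') as fp:
        fp.write(content)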
The object-oriented approach

#!/usr/bin/env python 
# -*- coding: utf-8 -*-
import urllib2
import re
import os
import sys

# Local filesystem encoding, used to re-encode the fetched page before writing it to disk
sys_encoding = sys.getfilesystemencoding()


class Spider(object):
    def __init__(self):
        self.url = 'http://www.qiushibaike.com/textnew/page/%s/?s=4979315'
        self.user_agent = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.81 Safari/537.36'

    # Fetch the HTML source of one page
    def get_page(self, page_index):
        headers = {'User-Agent': self.user_agent}
        try:
            req = urllib2.Request(url=self.url % str(page_index), headers=headers)
            res = urllib2.urlopen(req)
            html = res.read().decode("UTF-8").encode(sys_encoding)
            return html
        except urllib2.HTTPError as e:
            print e
            exit()
        except urllib2.URLError as e:
            print e
            exit()

    # Parse the HTML source
    def analysis(self, html):
        regex_content = re.compile(
            '<div class="article block untagged mb15" id=(.*?)>(?:.*?)<div class="content">(.*?)</div>',
            re.S)
        items = re.findall(regex_content, html)
        return items

    # Save the extracted data to files
    def save(self, items, path):
        if not os.path.exists(path):
            os.makedirs(path)
        for item in items:
            file_name = item[0].strip('\'')
            # strip the <span> wrapper, drop source newlines, and turn <br/> tags into real line breaks
            content = item[1].strip().replace('<span>', '').replace('</span>', '').replace(
                '\n', '').replace('<br/>', '\n')
            file_path = path + '/' + file_name + '.txt'
            with open(file_path, 'w') as fp:
                fp.write(content)  # the with block closes the file automatically

    # Driver method
    def run(self):
        print 'Starting to fetch content...'
        for i in range(1, 3):  # pages 1 and 2
            content = self.get_page(i)
            items = self.analysis(content)
            self.save(items, 'qiubai')
        print 'Finished fetching content.'


if __name__ == '__main__':
    sp = Spider()
    sp.run()
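Note that urllib2 exists only on Python 2; in Python 3 it was split into urllib.request and urllib.error. For reference, a minimal sketch of the same fetch step on Python 3 (the rest of this post stays on Python 2):

# Python 3 equivalent of the fetch step
import urllib.request

url = 'http://www.qiushibaike.com/textnew/'
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.81 Safari/537.36'}
req = urllib.request.Request(url=url, headers=headers)
with urllib.request.urlopen(req) as res:
    html = res.read().decode('utf-8')  # Python 3 strings are unicode; no re-encoding needed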

 

