Getting Started with Crawlers: beautifulsoup4.0


This post uses scraping WeChat official account articles through Sogou as an example.

Let's start with an example:
import re
import time
import urllib2
import sys
import os
from bs4 import BeautifulSoup


class parseHtml:
    def __init__(self, param):
        if not os.path.isfile(param):
            print "[ERR] File not found: " + param
            sys.exit(1)
        self.filename = param
        self.filestr = open(self.filename, 'r')
        # Name the parser explicitly so bs4 does not warn and the
        # behavior stays consistent across environments
        self.filesoup = BeautifulSoup(self.filestr, 'html.parser')

    def getRecordTitleAndName(self):
        # The article title sits in the page's <h2>; the account name
        # is in the element with id 'post-user'
        title = self.filesoup.h2.get_text()
        titlename = title.strip()
        webchat_name = self.filesoup.find(id='post-user').get_text()
        return [titlename, webchat_name]

    def getWechatNumberAndName(self):
        # The profile block holds the nickname, and profile_account holds
        # a "label account" pair; split() and take the second field
        name = self.filesoup.find("strong", {"class": "profile_nickname"}).string.strip()
        number = self.filesoup.find("p", {"class": "profile_account"}).get_text().split()
        return [name, number[1]]

    def getRecordList(self):
        # The article list is embedded as JSON inside one of the <script>
        # blocks; grab it with a regex and undo the HTML entity escaping
        for x in self.filesoup.find_all('script'):
            item = x.get_text()
            match = re.search('{.*list.*}', item)
            if match is not None:
                data_json_str = match.group(0)
                data_json_str = data_json_str.replace('&nbsp;', ' ').replace('&amp;', '&')
                return data_json_str.encode("utf-8")
        return False

    def getNumberSourceUrl(self):
        # In the Sogou result page, the first matched account's link
        # carries the attribute uigs="account_name_0"
        account = self.filesoup.find(uigs='account_name_0')
        if account is None:
            return False
        return account.get('href')
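
    def getRecordContent(self):
        # Called below for request_type '3' but missing from the original
        # listing; a minimal sketch, assuming the article body lives in the
        # div with id 'js_content' as on typical WeChat article pages
        content = self.filesoup.find(id='js_content')
        if content is None:
            return False
        return content.get_text().strip()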

 
# sys.argv raises IndexError for missing arguments rather than yielding
# None, so validate the argument count up front
if len(sys.argv) < 3:
    print "usage: " + sys.argv[0] + " <request_type> <request_param>"
    sys.exit(1)
request_type = sys.argv[1]
request_param = sys.argv[2]
if (request_type == '1'):
    request = parseHtml(request_param)
    url = request.getNumberSourceUrl()
    if (url is False):
        print "[ERR][step1]file is not available " + request_param
        sys.exit(1)
    else:
        print url.encode("utf-8")
        sys.exit(0)
elif (request_type == '2'):
    request = parseHtml(request_param)
    record_list = request.getRecordList()
    numberName = request.getWechatNumberAndName()
    if (record_list is False):
        print "[ERR][step2]file is not available " + request_param
        sys.exit(1)
    else:
        print numberName[0].encode("utf-8")
        print numberName[1].encode("utf-8")
        print record_list
        sys.exit(0)
elif (request_type == '3'):
    request = parseHtml(request_param)
    titleAndName = request.getRecordTitleAndName()
    record_content = request.getRecordContent()
    if (record_content is False):
        print "[ERR][step3]file is not available " + request_param
        sys.exit(1)
    else:
        print titleAndName[0].encode("utf-8")
        print titleAndName[1].encode("utf-8")
        print record_content.encode("utf-8")
        sys.exit(0)

The code above parses the DOM of pages that have already been downloaded to local files.
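
Assuming the script is saved as parse.py and a Sogou result page has been saved as sogou_result.html (both names are placeholders), a typical invocation looks like:

python parse.py 1 sogou_result.html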

urllib2 works well for fetching a page from a given URL.
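
For completeness, here is a minimal sketch of how such a page could be downloaded in the first place; the URL and output filename are placeholder assumptions, and Sogou may additionally require a browser-like User-Agent header:

import urllib2

# Placeholder URL; substitute the actual Sogou search or article URL
url = 'http://weixin.sogou.com/'
# Send a browser-like User-Agent, since some sites reject the default one
request = urllib2.Request(url, headers={'User-Agent': 'Mozilla/5.0'})
html = urllib2.urlopen(request).read()
with open('sogou_result.html', 'w') as f:
    f.write(html)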

posted on 2017-04-07 17:16 丁小宝