Getting started with crawlers: beautifulsoup4.0
We'll take scraping WeChat public account articles through Sogou as the example.
Let's start with a sample (one note: getRecordContent is sketched in below on the assumption that WeChat renders the article body in div#js_content, since the listing calls it without defining it):
    # -*- coding: utf-8 -*-
    import re
    import os
    import sys
    from bs4 import BeautifulSoup


    class parseHtml:

        def __init__(self, param):
            if not os.path.isfile(param):
                print "[ERR]Not Found File " + param
                sys.exit(1)
            self.filename = param
            self.filestr = open(self.filename, 'r')
            self.filesoup = BeautifulSoup(self.filestr)

        def getRecordTitleAndName(self):
            # Article page: the title sits in the first <h2>, the account name in #post-user
            titlename = self.filesoup.h2.get_text().strip()
            webchat_name = self.filesoup.find(id='post-user').get_text()
            return [titlename, webchat_name]

        def getWechatNumberAndName(self):
            # Account homepage: nickname and WeChat ID from the profile block
            name = self.filesoup.find("strong", {"class": "profile_nickname"}).string.strip()
            number = self.filesoup.find("p", {"class": "profile_account"}).get_text().split()
            return [name, number[1]]

        def getRecordList(self):
            # The article list is embedded as JSON in one of the <script> tags
            for x in self.filesoup.find_all('script'):
                item = x.get_text()
                match = re.search('{.*list.*}', item)
                if match is not None:
                    data_json_str = match.group(0)
                    # Undo the HTML entity escaping inside the embedded JSON
                    data_json_str = data_json_str.replace('&nbsp;', ' ').replace('&amp;', '&')
                    return data_json_str.encode("utf-8")
            return False

        def getRecordContent(self):
            # Assumption: the article body sits in div#js_content, as on WeChat article pages
            content = self.filesoup.find(id='js_content')
            if content is None:
                return False
            return content.get_text()

        def getNumberSourceUrl(self):
            # Sogou marks the first search hit with uigs="account_name_0"
            account = self.filesoup.find(uigs='account_name_0')
            if account is None:
                return False
            return account.get('href')


    if len(sys.argv) < 3:
        print "params is illegal.[request_type][request_param]"
        sys.exit(1)
    request_type = sys.argv[1]
    request_param = sys.argv[2]

    if request_type == '1':
        # Step 1: Sogou search result page -> the account homepage url
        request = parseHtml(request_param)
        url = request.getNumberSourceUrl()
        if url is False:
            print "[ERR][step1]file is not available " + request_param
            sys.exit(1)
        else:
            print url.encode("utf-8")
            sys.exit(0)
    elif request_type == '2':
        # Step 2: account homepage -> nickname, WeChat ID and the article list JSON
        request = parseHtml(request_param)
        record_list = request.getRecordList()
        numberName = request.getWechatNumberAndName()
        if record_list is False:
            print "[ERR][step2]file is not available " + request_param
        else:
            print numberName[0].encode("utf-8")
            print numberName[1].encode("utf-8")
            print record_list
            sys.exit(0)
    elif request_type == '3':
        # Step 3: article page -> title, account name and body text
        request = parseHtml(request_param)
        titleAndName = request.getRecordTitleAndName()
        record_content = request.getRecordContent()
        if record_content is False:
            print "[ERR][step3]file is not available " + request_param
            sys.exit(1)
        else:
            print titleAndName[0].encode("utf-8")
            print titleAndName[1].encode("utf-8")
            print record_content.encode("utf-8")
            sys.exit(0)
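The script is driven by a mode flag plus the path of a locally saved page. A hypothetical session (the script and file names here are made up for illustration) would look like:

    python parse.py 1 search_result.html    # extract the account homepage url
    python parse.py 2 account_home.html     # extract nickname, WeChat ID and article list
    python parse.py 3 article.html          # extract title, account name and body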
The code above does the DOM-parsing half of the job, working on pages that have already been downloaded to local files.
For the downloading half, urllib2 does a fine job of fetching a page given its url.
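A minimal fetch helper might look like the sketch below; the browser-style User-Agent header and the example Sogou search url are assumptions, not taken from the original script:

    import urllib2

    def fetch(url, path):
        # Send a browser-like User-Agent; Sogou tends to reject the urllib2 default
        req = urllib2.Request(url, headers={'User-Agent': 'Mozilla/5.0'})
        html = urllib2.urlopen(req, timeout=10).read()
        with open(path, 'w') as f:
            f.write(html)

    # e.g. save the account-search result page that step 1 parses
    fetch('http://weixin.sogou.com/weixin?type=1&query=python', 'search_result.html')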