Scraper_compare('NoneType' object has no attribute 'group')

Each of the three ways of parsing a web page has its own uses and strengths, and comparing them makes it clearer which method suits which situation. When running the code, you may hit a bug:

results[field] = re.search('<tr id="places_%s__row">.*?<td class="w2p_fw">(.*?)</td>' % field, html).group()
AttributeError: 'NoneType' object has no attribute 'group'

This happens because one of the field names does not match any row in the page; delete that field and the script runs fine.
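The root cause is that re.search() returns None when the pattern finds no match, and None has no .group() method. Instead of deleting the offending field, you can guard against a failed match. A minimal sketch (the helper name safe_search is my own):

import re

def safe_search(pattern, html):
    # re.search() returns None on no match, so check before touching group()
    match = re.search(pattern, html)
    return match.groups()[0] if match else None

# Inside re_scraper's loop this would become (sketch):
# results[field] = safe_search('<tr id="places_%s__row">.*?<td class="w2p_fw">(.*?)</td>' % field, html)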

# Compare the relative performance of the three parsing approaches
import urllib2
import re
from bs4 import BeautifulSoup
import lxml.html
import time
def download(url, user_agent="wswp", num_retries=2):
    print "Downloading:", url
    headers = {"User-agent": user_agent}  # note: hyphen, not underscore
    request = urllib2.Request(url, headers=headers)
    try:
        html = urllib2.urlopen(request).read()
    except urllib2.URLError as e:
        print "Download error:", e.reason
        html = None
        if num_retries > 0:
            if hasattr(e, "code") and 500 <= e.code < 600:
                # retry on 5xx server errors
                return download(url, user_agent, num_retries-1)
    return html
# If you hit the error "AttributeError: 'NoneType' object has no attribute 'group'"
# below, simply remove the "iso" element from FIELDS.
FIELDS = ("area", "population", "iso", "country", "capital", "continent", "tld", "currency_code", "currency_name", "phone", "postal_code_format", "postal_code_regex", "languages", "neighbours")

# Regular expressions
def re_scraper(html):
    results = {}
    for field in FIELDS:
        results[field] = re.search('<tr id="places_%s__row">.*?<td class="w2p_fw">(.*?)</td>' % field, html).groups()[0]
    return results

# BeautifulSoup
def bs_scraper(html):
    soup = BeautifulSoup(html, "html.parser", from_encoding="utf8")
    results = {}
    for field in FIELDS:
        # Equivalent step-by-step lookup:
        # tr = soup.find(attrs={'id': 'places_%s__row' % field})
        # td = tr.find(attrs={'class': 'w2p_fw'})
        # results[field] = td.text
        results[field] = soup.find('table').find('tr', id='places_%s__row' % field).find('td', class_='w2p_fw').text
    return results

# lxml
def lxml_scraper(html):
    tree = lxml.html.fromstring(html)
    results = {}
    for field in FIELDS:
        results[field] = tree.cssselect('tr#places_%s__row > td.w2p_fw' % field)[0].text_content()
    return results

# Time the relative performance
NUM_ITERATIONS = 1000
html = download("http://example.webscraping.com/view/United-Kingdom-239")
for name, scraper in [("Regular expressions", re_scraper),
                      ("BeautifulSoup", bs_scraper),
                      ("Lxml", lxml_scraper)]:
    start = time.time()
    for i in range(NUM_ITERATIONS):
        if scraper == re_scraper:
            # Clear the regex module's compiled-pattern cache so the comparison is fair
            re.purge()
        result = scraper(html)
        assert (result['area'] == '244,820 square kilometres')

    end = time.time()
    print "%s: %.2f seconds" % (name, end - start)

The output:

Regular expressions: 3.82 seconds
BeautifulSoup: 25.92 seconds
Lxml: 4.33 seconds
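Incidentally, the standard library's timeit module gives the same measurement with less boilerplate. A minimal sketch, assuming the scrapers, html, and NUM_ITERATIONS from the listing above (note this version omits the re.purge() step):

import timeit

for name, scraper in [("Regular expressions", re_scraper),
                      ("BeautifulSoup", bs_scraper),
                      ("Lxml", lxml_scraper)]:
    # timeit.timeit takes a zero-argument callable and returns total seconds
    seconds = timeit.timeit(lambda: scraper(html), number=NUM_ITERATIONS)
    print "%s: %.2f seconds" % (name, seconds)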

As the numbers show, BeautifulSoup is by far the slowest, but it is also the simplest to use. In general, though, lxml is the best choice for scraping, because it is both fast and robust, while regular expressions and BeautifulSoup are only useful in certain specific scenarios.
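One caveat with the lxml approach: in newer lxml releases, cssselect() depends on the separate cssselect package. If that package is not available, an equivalent XPath query does the same job with no extra dependency. A minimal sketch, assuming the same FIELDS tuple and page structure as above:

import lxml.html

def lxml_xpath_scraper(html):
    tree = lxml.html.fromstring(html)
    results = {}
    for field in FIELDS:
        # XPath equivalent of the CSS selector 'tr#places_%s__row > td.w2p_fw'
        results[field] = tree.xpath('//tr[@id="places_%s__row"]/td[@class="w2p_fw"]' % field)[0].text_content()
    return results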
