re 与 BeautifulSoup 速度对比
-
测试代码
import re
from time import time

import requests
from bs4 import BeautifulSoup

# Browser-like request headers sent with the dict.cn lookup.
headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-US,en;q=0.9,zh-CN;q=0.8,zh;q=0.7',
    'Cache-Control': 'no-cache',
    'DNT': '1',
    'Host': 'dict.cn',
    'Pragma': 'no-cache',
    'Proxy-Connection': 'keep-alive',
    'Referer': 'http://dict.cn/',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36',
}


def get_translation(word):
    """Fetch the dict.cn page for *word* and time three extraction approaches.

    The page is downloaded once; the download itself is not timed. Each
    approach then extracts the entries 10 times, and the last result plus
    the elapsed wall-clock time are printed:

    1. ``re``: search for the ``basic clearfix`` div, then ``findall`` the
       ``<li>`` entries inside it.
    2. BeautifulSoup: CSS ``select`` and raw ``.text`` of each ``<li>``.
    3. BeautifulSoup: same selection, with ``strip()`` and ``split('\\n')``.

    For both BeautifulSoup runs the timer starts *before* the document is
    parsed, so the one-off lxml parse is part of the reported time.

    Args:
        word: The word to look up; it is interpolated into the URL as-is.

    Raises:
        ValueError: If the page has no ``<div class="basic clearfix">`` block.
    """
    # Send the module-level headers and bound the wait so the script
    # cannot hang forever on an unresponsive server.
    response = requests.get(
        'http://dict.cn/{}'.format(word),
        headers=headers,
        timeout=10,
    )
    text = response.text

    # --- re ---------------------------------------------------------------
    start_time = time()
    for _ in range(10):
        # The search is deliberately repeated every iteration: it is part of
        # the work being measured.
        block = re.search(
            '<div class="basic clearfix">(.*?)</div>',
            text,
            re.S
        )
        if block is None:
            # Fail with a clear message instead of an AttributeError on None.
            raise ValueError(
                'no "basic clearfix" block found for {!r}'.format(word)
            )
        a = re.findall(
            r'<li>\s*?<\w+>(\w+\.)</\w+>\s*?<\w+>(.*?)</\w+>\s*?</li>',
            block.group()
        )
    print(a)
    print('re: ', time() - start_time, '\n')

    # --- BeautifulSoup, raw text --------------------------------------------
    start_time = time()
    html1 = BeautifulSoup(text, 'lxml')
    for _ in range(10):
        # The last matched <li> is dropped (presumably it is not a
        # definition entry -- confirm against the page markup).
        a = [
            li.text
            for li in html1.select(
                '#content div.main div.word div.basic.clearfix li'
            )[:-1]
        ]
    print(a)
    print('BeautifulSoup without split and strip: ', time() - start_time, '\n')

    # --- BeautifulSoup, stripped and split ----------------------------------
    start_time = time()
    html2 = BeautifulSoup(text, 'lxml')
    for _ in range(10):
        a = [
            li.text.strip().split('\n')
            for li in html2.select(
                '#content div.main div.word div.basic.clearfix li'
            )[:-1]
        ]
    print(a)
    print('BeautifulSoup with split and strip: ', time() - start_time, '\n')


if __name__ == '__main__':
    get_translation('at')
-
运行结果
[('prep.', '在;向;达'), ('n.', '基普(老挝货币基本单位)(=att);[化]砹(极不稳定放射性元素)'), ('abbr.', '密封的(=airtight)'), ('abbr.', '气温(=air temperature)')] re: 0.002012014389038086 ['\nprep.\n在;向;达\n', '\nn.\n基普(老挝货币基本单位)(=att);[化]砹(极不稳定放射性元素)\n', '\nabbr.\n密封的(=airtight)\n', '\nabbr.\n气温(=air temperature)\n'] BeautifulSoup without split and strip: 1.0653910636901855 [['prep.', '在;向;达'], ['n.', '基普(老挝货币基本单位)(=att);[化]砹(极不稳定放射性元素)'], ['abbr.', '密封的(=airtight)'], ['abbr.', '气温(=air temperature)']] BeautifulSoup with split and strip: 0.8744978904724121
-
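总结：在本次测试中（每种方式各提取 10 次），re 耗时约 0.002 秒，而 BeautifulSoup 两种写法分别耗时约 1.07 秒和 0.87 秒（计时包含一次 lxml 解析），re 比 BeautifulSoup 快约 400–500 倍。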