re,xpath,BeautifulSoup三种方法爬取古诗词网上诗歌
re、xpath、bs4 对同一个页面的解析速度对比
发现re比xpath快接近10倍,xpath比bs4快接近10倍
可见要想追求极致速度,使用正则表达式解析有多重要
1、re解析的代码
# Parse the page with regular expressions.
# Key point: each field sits at the same position inside every item,
# so one findall() per field extracts the whole column at once.
import re

# One dict per poem, accumulated across every crawled page.
DATA = []


def getHTMLtext(url, headers, timeout=10):
    """Fetch *url* and return its body decoded as UTF-8, or '' on any request error.

    Args:
        url: page URL to download.
        headers: HTTP headers dict (User-Agent etc.).
        timeout: per-request timeout in seconds.
    """
    # Local import keeps this module importable even where requests is absent.
    import requests
    try:
        resp = requests.get(url, headers=headers, timeout=timeout)
        # BUGFIX: the original wrote `resp.raise_for_status` without (),
        # which is a no-op attribute access — HTTP errors were never raised.
        resp.raise_for_status()
        resp.encoding = 'utf-8'
        return resp.text
    except requests.RequestException:
        # Narrowed from a bare `except:` — only network/HTTP failures map to ''.
        return ''


def reParser(text):
    """Extract every poem on the page into DATA.

    Each entry is a dict with keys 诗词名/朝代/作者/内容/点赞数.
    """
    name_list = re.findall(r'<div class="yizhu".*?<b>(.*?)</b>', text, re.S)  # re.S == re.DOTALL
    dynasty_list = re.findall(r'<p class="source">.*?target="_blank">(.*?)</a>', text, re.S)
    author_list = re.findall(
        r'<p class="source">.*?target="_blank">.*?</a>.*?target="_blank">(.*?)</a>', text, re.S)
    row_content_list = re.findall(r'<div class="contson".*?>(.*?)</div>', text, re.S)
    # Strip inner tags (non-greedy '<.*?>' — greedy would eat the whole body)
    # and surrounding whitespace.
    content_list = [re.sub(r'<.*?>', '', content).strip() for content in row_content_list]
    likes_list = re.findall(r'<span> (\d*?)</span>', text, re.S)
    for name, dynasty, author, content, likes in zip(
            name_list, dynasty_list, author_list, content_list, likes_list):
        DATA.append({
            '诗词名': name,
            '朝代': dynasty,
            '作者': author,
            '内容': content,
            '点赞数': likes,
        })


def print_poetry(data):
    """Pretty-print each poem dict in *data*, separated by a starred rule."""
    for every_poetry in data:
        print(every_poetry['诗词名'])
        print(every_poetry['朝代'] + ':' + every_poetry['作者'])
        print(every_poetry['内容'])
        print('有{}人喜欢这首诗(词)哦'.format(every_poetry["点赞数"]))
        print("\n" + '*' * 50 + "\n")


if __name__ == '__main__':
    row_url = 'https://www.gushiwen.org/default_{}.aspx'
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.109 Safari/537.36'}
    num = input('请输入要爬取的页数(1-100):')
    # SECURITY FIX: int() instead of eval() — never evaluate untrusted user input.
    for i in range(int(num)):
        url = row_url.format(i + 1)
        text = getHTMLtext(url, headers)
        if text == '':
            print('url: {} 访问失败'.format(url))
        else:
            reParser(text)
    # Rank all collected poems by like count, highest first.
    DATA.sort(key=lambda x: int(x['点赞数']), reverse=True)
    TOP10 = DATA[:10]
    print_poetry(TOP10)
2、Xpath版本
# XPath (lxml) version of the poem crawler.
import requests  # BUGFIX: missing from the original snippet, yet getHTMLtext calls requests.get
from lxml import etree

# One dict per poem, accumulated across every crawled page.
DATA = []


def getHTMLtext(url, headers, timeout=10):
    """Fetch *url* and return its body decoded as UTF-8, or '' on any request error."""
    try:
        resp = requests.get(url, headers=headers, timeout=timeout)
        # BUGFIX: the original wrote `resp.raise_for_status` without (),
        # a no-op attribute access — HTTP errors were never raised.
        resp.raise_for_status()
        resp.encoding = 'utf-8'
        return resp.text
    except requests.RequestException:
        # Narrowed from a bare `except:` — only network/HTTP failures map to ''.
        return ''


def xpathParser(text):
    """Parse one page of HTML into DATA using absolute/relative XPath queries."""
    htmlElement = etree.HTML(text)  # lxml.etree._Element
    name_list = htmlElement.xpath('/html/body/div[2]/div[1]/div/div[1]/p[1]/a/b/text()')
    dynasty_list = htmlElement.xpath('/html/body/div[2]/div[1]/div/div[1]/p[2]/a[1]/text()')
    author_list = htmlElement.xpath('/html/body/div[2]/div[1]/div/div[1]/p[2]/a[2]/text()')
    content_list = []
    # Returns a list whose elements are themselves _Element nodes.
    poetries = htmlElement.xpath('//div[@class="contson" and contains(@id,"contson")]')
    # Debug aid: print(etree.tostring(poetries[0], encoding='utf-8').decode('utf-8'))
    for poetry in poetries:
        # The leading '.' is essential: without it the query searches the
        # whole document instead of this poetry node's subtree.
        row_content = ''.join(poetry.xpath('.//text()'))
        content_list.append(row_content.replace('\n', ''))
    row_likes_list = htmlElement.xpath('//a[contains(@id,"agood")]/span/text()')
    likes_list = [int(like.strip()) for like in row_likes_list]
    for name, dynasty, author, content, likes in zip(
            name_list, dynasty_list, author_list, content_list, likes_list):
        DATA.append({
            '诗词名': name,
            '朝代': dynasty,
            '作者': author,
            '内容': content,
            '点赞数': likes,
        })


def print_poetry(data):
    """Pretty-print each poem dict in *data*, separated by a starred rule."""
    for every_poetry in data:
        print(every_poetry['诗词名'])
        print(every_poetry['朝代'] + ':' + every_poetry['作者'])
        print(every_poetry['内容'])
        print('有{}人喜欢这首诗(词)哦'.format(every_poetry["点赞数"]))
        print("\n" + '*' * 50 + "\n")


if __name__ == '__main__':
    row_url = 'https://www.gushiwen.org/default_{}.aspx'
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.109 Safari/537.36'}
    num = input('请输入要爬取的页数(1-100):')
    # SECURITY FIX: int() instead of eval() — never evaluate untrusted user input.
    for i in range(int(num)):
        url = row_url.format(i + 1)
        text = getHTMLtext(url, headers)
        if text == '':
            print('url: {} 访问失败'.format(url))
        else:
            xpathParser(text)
    # Rank all collected poems by like count, highest first.
    DATA.sort(key=lambda x: int(x['点赞数']), reverse=True)
    TOP10 = DATA[:10]
    print_poetry(TOP10)
3、bs4版本
# BeautifulSoup (bs4) version: locate each poem card with find_all, then
# drill into each card with find().
import requests
from bs4 import BeautifulSoup

DATA = []


def getHTMLtext(url, headers, timeout=10):
    """Fetch *url* and return its body decoded as UTF-8, or '' on any request error."""
    try:
        resp = requests.get(url, headers=headers, timeout=timeout)
        # BUGFIX: the original wrote `resp.raise_for_status` without (),
        # a no-op attribute access — HTTP errors were never raised.
        resp.raise_for_status()
        resp.encoding = 'utf-8'
        return resp.text
    except requests.RequestException:
        # Narrowed from a bare `except:` — only network/HTTP failures map to ''.
        return ''


def bs4_find_all_Parser(text):
    """Print title, dynasty/author, content and like count for each poem card."""
    soup = BeautifulSoup(text, 'lxml')
    # find_all returns a bs4.element.ResultSet of Tag objects.  The page has
    # extra "sons" divs after the poems, so keep only the first 10 real cards.
    sons = soup.find_all('div', class_="sons")[:10]
    for son in sons:
        name = son.find('b').string
        print(name)
        dynasty_author = son.find('p', class_="source").get_text()
        print(dynasty_author)
        content = son.find('div', class_="contson").get_text().strip()
        print(content)
        # The like count is the second <span> inside the card.
        like = son.find_all('span')[1].string.strip()
        print('点赞数:' + like)
        print('\n' + '*' * 30 + '\n')


if __name__ == '__main__':
    url = 'https://www.gushiwen.org/default_1.aspx'
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.109 Safari/537.36'}
    text = getHTMLtext(url, headers)
    if text == '':
        print('url: {} 访问失败'.format(url))
    else:
        bs4_find_all_Parser(text)