# Crawler 15: scraping the Chinese poetry site (gushiwen.org) with regular expressions
"""Scrape poem listings from gushiwen.org and print one dict per poem.

Pages 1-10 of the default listing are fetched; title, dynasty, author and
body text are extracted with regular expressions (no HTML parser needed).
"""
import re

# Browser-like UA so the site serves the normal desktop page.
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36"
    ),
}

# Patterns compiled once (reused for every page).  re.DOTALL makes `.`
# match newlines, since each record spans multiple lines of HTML.
_TITLE_RE = re.compile(r'<div\sclass="cont">.*?<b>(.*?)</b>', re.DOTALL)
_DYNASTY_RE = re.compile(r'<p\sclass="source">.*?<a.*?>(.*?)</a>', re.DOTALL)
# Author is the text of the SECOND <a> inside the "source" paragraph.
_AUTHOR_RE = re.compile(r'<p\sclass="source">.*?<a.*?>.*?<a.*?>(.*?)</a>', re.DOTALL)
_CONTENT_RE = re.compile(r'<div\sclass="contson".*?>(.*?)</div>', re.DOTALL)
_TAG_RE = re.compile(r'<.*?>')  # strips inline markup (<br/>, <p>, ...) from the body


def parse_poems(text):
    """Extract poems from one listing page's HTML.

    Args:
        text: the page HTML as a string.

    Returns:
        A list of dicts with keys 'title', 'dynasty', 'author', 'content'.
        Records are paired positionally via zip, so a page where the four
        patterns match unequal counts yields only the common prefix.
    """
    titles = _TITLE_RE.findall(text)
    dynasties = _DYNASTY_RE.findall(text)
    authors = _AUTHOR_RE.findall(text)
    contents = [_TAG_RE.sub("", body).strip() for body in _CONTENT_RE.findall(text)]
    return [
        {"title": title, "dynasty": dynasty, "author": author, "content": content}
        for title, dynasty, author, content in zip(titles, dynasties, authors, contents)
    ]


def parse_page(url):
    """Fetch one listing page and print each extracted poem dict."""
    # Third-party dependency imported lazily so parse_poems() stays usable
    # (and testable) without requests installed.
    import requests

    response = requests.get(url, headers=HEADERS)
    # Fix: the site serves UTF-8, but without an explicit charset header
    # requests falls back to ISO-8859-1 and the Chinese text is mojibaked.
    response.encoding = "utf-8"
    for poem in parse_poems(response.text):
        print(poem)


def main():
    """Crawl listing pages 1 through 10."""
    for page in range(1, 11):
        parse_page("https://www.gushiwen.org/default_%s.aspx" % page)


if __name__ == '__main__':
    main()