第7课-正则表达式实现爬虫实战
1、古诗文网爬虫
import re

# Request headers sent with every page fetch (a desktop Chrome user agent).
headers = {
    "user-agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36"
    ),
}

# Accumulates one dict per poem across every page passed to get_poetics().
my_poetic_list = []

# Compiled once at import time instead of on every call.
# DOTALL lets ".*?" run across the line breaks inside the markup.
_POEM_TITLE_RE = re.compile(r'<div class="cont">.*?<b>(.*?)</b>', re.DOTALL)
_POEM_YEAR_RE = re.compile(r'<p class="source">.*?<a.*?>(.*?)</a>', re.DOTALL)
_POEM_POET_RE = re.compile(
    r'<p class="source">.*?<a.*?>.*?</a>.*?<a.*?>(.*?)</a>', re.DOTALL
)
_POEM_BODY_RE = re.compile(r'<div class="contson".*?>(.*?)</div>', re.DOTALL)
# Deliberately NOT DOTALL: only tags that sit on a single line are stripped,
# which is what the original substitution did.
_POEM_TAG_RE = re.compile(r"<.*?>")


def parse_poetics(text):
    """Extract poem records from the HTML of one listing page.

    Args:
        text: HTML source of the page.

    Returns:
        A list of dicts with the keys ``"title"``, ``"year"``, ``"pote"``
        (the second link of the source paragraph) and ``"poetics"`` (the
        body with tags, newlines and ideographic spaces removed).

    The four fields are matched independently and paired by position.
    ``zip`` stops at the shortest match list, so a page on which one field
    matched fewer times yields fewer records instead of raising
    ``IndexError``.
    """
    titles = _POEM_TITLE_RE.findall(text)
    years = _POEM_YEAR_RE.findall(text)
    poets = _POEM_POET_RE.findall(text)
    bodies = [
        _POEM_TAG_RE.sub('', raw).replace("\n", '').replace("\u3000", '')
        for raw in _POEM_BODY_RE.findall(text)
    ]
    return [
        {"title": title, "year": year, "pote": poet, "poetics": body}
        for title, year, poet, body in zip(titles, years, poets, bodies)
    ]


def get_poetics(my_url, timeout=10):
    """Download one page, parse it and append its poems to my_poetic_list.

    Args:
        my_url: URL of the page to fetch.
        timeout: Seconds to wait for the server before giving up, so that a
            stalled connection cannot hang the crawl indefinitely.

    Returns:
        The list of poem dicts parsed from this page (the same dicts that
        were appended to ``my_poetic_list``).
    """
    # Imported here so that parse_poetics() stays usable without the HTTP
    # dependency being installed.
    import requests

    text = requests.get(url=my_url, headers=headers, timeout=timeout).text
    poems = parse_poetics(text)
    my_poetic_list.extend(poems)
    return poems


if __name__ == '__main__':
    for i in range(0, 11):
        url = "https://www.gushiwen.org/default_{}.aspx".format(i)
        get_poetics(url)
    for poem in my_poetic_list:
        print(poem)
2、糗事百科案例
import re

# Request headers sent with every page fetch (a desktop Chrome user agent).
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36"
    ),
}

# Accumulates one dict per joke across every page passed to get_acticles().
my_lovehhy = []

# Compiled once at import time instead of on every call.
# DOTALL lets ".*?" run across the line breaks inside the markup.
_JOKE_TITLE_RE = re.compile(r'<h3.*?><a.*?>(.*?)</a>', re.DOTALL)
_JOKE_BODY_RE = re.compile(r'<div id="endtext">(.*?)</div>', re.DOTALL)
# Deliberately NOT DOTALL: only tags that sit on a single line are stripped,
# which is what the original substitution did.
_JOKE_TAG_RE = re.compile(r"<.*?>")


def parse_articles(text):
    """Extract joke records from the HTML of one page.

    Args:
        text: HTML source of the page.

    Returns:
        A list of dicts with the keys ``"title"`` and ``"content"`` (the
        body with tags and ideographic spaces removed; newlines are kept).

    Titles and bodies are matched independently and paired by position.
    ``zip`` stops at the shorter list, so a body without a matching title is
    dropped instead of raising ``IndexError``.
    """
    titles = _JOKE_TITLE_RE.findall(text)
    bodies = _JOKE_BODY_RE.findall(text)
    return [
        {
            "title": title,
            "content": _JOKE_TAG_RE.sub('', body).replace("\u3000", ''),
        }
        for title, body in zip(titles, bodies)
    ]


def get_acticles(url, timeout=10):
    """Download one page, parse it and append its jokes to my_lovehhy.

    The misspelled name ("acticles") is kept so existing callers keep working.

    Args:
        url: URL of the page to fetch.
        timeout: Seconds to wait for the server before giving up, so that a
            stalled connection cannot hang the crawl indefinitely.

    Returns:
        The list of joke dicts parsed from this page (the same dicts that
        were appended to ``my_lovehhy``).
    """
    # Imported here so that parse_articles() stays usable without the HTTP
    # dependency being installed.
    import requests

    text = requests.get(url=url, headers=headers, timeout=timeout).text
    jokes = parse_articles(text)
    my_lovehhy.extend(jokes)
    return jokes


if __name__ == '__main__':
    for i in range(10):
        url = "http://www.lovehhy.net/Joke/Detail/QSBK/{}".format(i)
        get_acticles(url)
    for joke in my_lovehhy:
        print(joke)