糗事百科爬虫案例
爬取糗事百科热门页所有段子的作者、标题、内容链接、好笑数和评论数
# coding=utf-8
"""Scraper for qiushibaike.com hot jokes.

Collects, for every item on the hot-list pages: author name, title,
content link, funny count and comment count, and appends each item as
one JSON line to ``qiubai.txt``.
"""
from lxml import etree
import requests
import json


class QiubaiSpider:
    """Crawl the hot-joke listing pages and save each item as one JSON line."""

    def __init__(self, max_page=13):
        """Set up the URL template, page count and request headers.

        :param max_page: number of listing pages to crawl (site pages
            are numbered 1..max_page); defaults to the original 13.
        """
        self.url_temp = "https://www.qiushibaike.com/8hr/page/{}/"
        self.max_page = max_page
        # Desktop browser UA so the site serves the normal HTML layout.
        self.header = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36"}

    def get_url_list(self):
        """Return the list of listing-page URLs to crawl (pages 1..max_page)."""
        return [self.url_temp.format(i) for i in range(1, self.max_page + 1)]

    def parse_url(self, url):
        """Fetch *url* and return the response body decoded as text."""
        print("正在爬取:", url)
        # Timeout so one stalled connection cannot hang the whole run
        # (the original call had no timeout at all).
        response = requests.get(url, headers=self.header, timeout=10)
        return response.content.decode()

    @staticmethod
    def _first(nodes, default=None):
        """Return the first element of an xpath result list, or *default* if empty."""
        return nodes[0] if nodes else default

    def get_content_list(self, html_str):
        """Extract one dict per joke item from a listing page's HTML.

        :param html_str: decoded HTML of a listing page.
        :return: list of dicts with keys 作者名 / 标题 / 内容链接 / 好笑数 / 评论.
        """
        html = etree.HTML(html_str)
        # 1. group: one <li> per recommended article
        div_list = html.xpath("//div[@class='recommend-article']//li")
        content_list = []
        for div in div_list:
            item = {}
            item["作者名"] = self._first(div.xpath(".//span[@class='recmd-name']/text()"))
            item["标题"] = div.xpath(".//a[@class='recmd-content']/text()")
            href = div.xpath(".//a[@class='recmd-content']/@href")
            item["内容链接"] = ('https://www.qiushibaike.com' + href[0]) if href else None
            # Both counts live in the same span list; evaluate the xpath once.
            nums = div.xpath(".//div[@class='recmd-num']/span/text()")
            item["好笑数"] = nums[0] if nums else None
            # Comment count is the second-to-last span.  Guard the index:
            # the original unguarded nums[-2] raised IndexError on items
            # (e.g. ads) that carry fewer than two spans.
            item["评论"] = nums[-2] if len(nums) >= 2 else None
            content_list.append(item)
        return content_list

    def save_content(self, content_list):
        """Append each extracted item as one JSON line to qiubai.txt."""
        with open("qiubai.txt", "a", encoding="utf-8") as f:
            for content in content_list:
                f.write(json.dumps(content, ensure_ascii=False))
                f.write("\n")
        print("保存成功")

    def run(self):
        """Crawl every listing page: fetch, extract, save."""
        # 1. build the url list from the template
        url_list = self.get_url_list()
        # 2. request each page and get the response
        for url in url_list:
            html_str = self.parse_url(url)
            # 3. extract the data
            content_list = self.get_content_list(html_str)
            # 4. save it
            self.save_content(content_list)


if __name__ == '__main__':
    qiubai = QiubaiSpider()
    qiubai.run()