# 糗事百科段子爬取 (Qiushibaike text-joke scraper)
import json

import requests
from lxml import etree


class QiuSpider:
    """Scrape text posts from qiushibaike.com and append them to a local file.

    The spider walks the paginated ``/text/page/<n>/`` listing, extracts one
    dict per post (content, author details, image URLs, vote count) and
    appends each dict to the output file as pretty-printed JSON.
    """

    # Seconds to wait on a request before giving up; without a timeout a
    # stalled connection would block the crawl indefinitely.
    REQUEST_TIMEOUT = 10

    def __init__(self, page_count=13, output_path="糗事百科.text"):
        """Set up URL templates, request headers and crawl settings.

        Args:
            page_count: Number of listing pages to crawl, starting at page 1.
            output_path: File that scraped items are appended to.
        """
        # Template for the paginated listing pages.
        self.url_temp = "https://www.qiushibaike.com/text/page/{}/"
        # Template used to turn a site-relative href into an absolute URL.
        self.url_home = "https://www.qiushibaike.com{}"
        self.headers = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36"}
        self.page_count = page_count
        self.output_path = output_path

    @staticmethod
    def _first(values, default=None):
        """Return the first element of ``values``, or ``default`` if empty."""
        return values[0] if values else default

    @staticmethod
    def _with_scheme(src):
        """Prefix ``src`` with ``https:``; pass ``None`` through unchanged."""
        return "https:" + src if src is not None else None

    def get_url_list(self):
        """Return the listing-page URLs for pages 1..``page_count``."""
        return [
            self.url_temp.format(page)
            for page in range(1, self.page_count + 1)
        ]

    def parse_url(self, url):
        """Fetch ``url`` and return the response body decoded as text.

        Raises:
            requests.exceptions.RequestException: on connection errors or
                when the server does not answer within ``REQUEST_TIMEOUT``.
        """
        print(url)
        response = requests.get(
            url, headers=self.headers, timeout=self.REQUEST_TIMEOUT
        )
        return response.content.decode()

    def supplement_text(self, href_str):
        """Fetch the detail page of a truncated post and return its text.

        Args:
            href_str: List of site-relative hrefs as returned by an XPath
                ``@href`` query; only the first entry is used.

        Returns:
            The first text node of the detail page's content div, or
            ``None`` when the href list or the content node is missing.
        """
        if not href_str:
            return None
        detail_html = self.parse_url(self.url_home.format(href_str[0]))
        detail_tree = etree.HTML(detail_html)
        # Guard against a document the parser could not turn into a tree.
        if detail_tree is None:
            return None
        return self._first(
            detail_tree.xpath("//div[@class='content']/text()")
        )

    def _parse_item(self, div):
        """Build the item dict for a single post ``div`` element."""
        item = {}

        # Post content: truncated posts carry a "查看全文" marker span and
        # need a second request for the full text.
        if div.xpath(".//div[@class='content']/span[text()='查看全文']"):
            detail_href = div.xpath(".//a[@class='contentHerf']/@href")
            item["content"] = self.supplement_text(detail_href)
        else:
            text = self._first(
                div.xpath(".//div[@class='content']/span/text()")
            )
            item["content"] = (
                text.replace("\n", "") if text is not None else None
            )

        # Author name (taken from the avatar's alt text).
        item["author_name"] = self._first(div.xpath(".//div/a/img/@alt"))

        # Author age: text of the div whose class contains "articleGender".
        item["author_age"] = self._first(
            div.xpath(".//div[contains(@class,'articleGender')]/text()")
        )

        # Author gender: last class token of that div with "Icon" removed.
        gender_class = self._first(
            div.xpath(".//div[contains(@class,'articleGender')]/@class")
        )
        item["author_gender"] = (
            gender_class.split(" ")[-1].replace("Icon", "")
            if gender_class is not None
            else None
        )

        # Image attached to the post, if any.
        item["content_img"] = self._with_scheme(
            self._first(div.xpath(".//div[@class='thumb']/a/img/@src"))
        )

        # Author avatar.
        item["author_img"] = self._with_scheme(
            self._first(
                div.xpath(".//div[@class='author clearfix']//img/@src")
            )
        )

        # Vote ("funny") count.
        item["stats_vote"] = self._first(
            div.xpath(".//span[@class='stats-vote']/i/text()")
        )
        return item

    def get_content_list(self, html_str):
        """Extract one item dict per post from a listing page.

        Args:
            html_str: HTML text of a listing page.

        Returns:
            A list of dicts with the keys ``content``, ``author_name``,
            ``author_age``, ``author_gender``, ``content_img``,
            ``author_img`` and ``stats_vote``. A field is ``None`` when the
            page does not provide it.
        """
        html = etree.HTML(html_str)
        # Guard against a document the parser could not turn into a tree.
        if html is None:
            return []
        # Each direct child div of #content-left is treated as one post.
        post_divs = html.xpath("//div[@id='content-left']/div")
        return [self._parse_item(div) for div in post_divs]

    def save_content_list(self, content_list):
        """Append every item in ``content_list`` to the output file as JSON."""
        if not content_list:
            # Nothing to write; avoid creating or touching the file.
            return
        # Open once per batch rather than once per item.
        with open(self.output_path, "a", encoding="utf-8") as f:
            for entry in content_list:
                f.write(json.dumps(entry, ensure_ascii=False, indent=2))
                f.write("\n")

    def run(self):
        """Crawl every listing page and persist the extracted items."""
        # 1. Build the list of listing-page URLs.
        for url in self.get_url_list():
            # 2. Request the page.
            html_str = self.parse_url(url)
            # 3. Extract the items.
            content_list = self.get_content_list(html_str)
            # 4. Save them.
            self.save_content_list(content_list)


if __name__ == '__main__':
    qiubai = QiuSpider()
    qiubai.run()