Scraping jokes from 糗事百科 (Qiushibaike)


import requests
from lxml import etree
import json

class QiuSpider:
    def __init__(self):
        self.url_temp = "https://www.qiushibaike.com/text/page/{}/"  # page URL template
        self.url_home = "https://www.qiushibaike.com{}"  # base URL for completing relative links
        self.headers = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36"}

    def get_url_list(self):  # build the list of page URLs
        return [self.url_temp.format(i) for i in range(1, 14)]  # list comprehension for pages 1-13 of the text channel

    def parse_url(self, url):  # request a URL and return the decoded HTML
        print(url)
        response = requests.get(url, headers=self.headers)  # send the request
        return response.content.decode()  # decode the response body
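
    # NOTE: illustrative addition, not part of the original post. A minimal
    # retry variant of parse_url, assuming transient network errors are the
    # main failure mode; the name parse_url_with_retry, the 10-second timeout
    # and the retry count are all hypothetical choices.
    def parse_url_with_retry(self, url, retries=3):
        for attempt in range(retries):
            try:
                response = requests.get(url, headers=self.headers, timeout=10)
                response.raise_for_status()  # turn HTTP 4xx/5xx into exceptions
                return response.content.decode()
            except requests.RequestException:
                if attempt == retries - 1:  # out of attempts: re-raise to the caller
                    raise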

    def supplement_text(self, href_str):  # fetch the full text of a truncated post
        supplement_url = self.url_home.format(href_str[0])  # join base URL and relative href
        supplement_str = self.parse_url(supplement_url)  # download the detail page
        html_sup = etree.HTML(supplement_str)
        return "".join(html_sup.xpath("//div[@class='content']/text()")).strip()  # join all text nodes: a post may span several lines

    def get_content_list(self, html_str):  # parse one listing page into a list of item dicts
        html = etree.HTML(html_str)
        div_list = html.xpath("//div[@id='content-left']/div")  # one div per post
        content_list = []
        for div in div_list:
            item = {}
            # the text of the post
            if div.xpath(".//div[@class='content']/span[text()='查看全文']"):  # truncated posts show a '查看全文' (view full text) link
                supplement_href = div.xpath(".//a[@class='contentHerf']/@href")  # 'contentHerf' is the spelling used in the site's HTML
                item["content"] = self.supplement_text(supplement_href) if supplement_href else None  # guard against a missing link
            else:
                cont = div.xpath(".//div[@class='content']/span/text()")
                item["content"] = "".join(cont).replace("\n", "")  # join all text nodes and drop newlines
            # author name
            item["author_name"] = div.xpath(".//div/a/img/@alt")
            item["author_name"] = item["author_name"][0] if len(item["author_name"]) > 0 else None  # guard against missing data
            # author age
            item["author_age"] = div.xpath(".//div[contains(@class,'articleGender')]/text()")  # divs whose class attribute contains 'articleGender'
            item["author_age"] = item["author_age"][0] if len(item["author_age"]) > 0 else None  # guard against missing data
            # author gender
            item["author_gender"] = div.xpath(".//div[contains(@class,'articleGender')]/@class")  # read the class attribute
            item["author_gender"] = item["author_gender"][0].split(" ")[-1].replace("Icon", "") if len(item["author_gender"]) > 0 else None  # e.g. 'articleGender manIcon' -> 'man'
            # image embedded in the post, if any
            item["content_img"] = div.xpath(".//div[@class='thumb']/a/img/@src")
            item["content_img"] = "https:" + item["content_img"][0] if len(item["content_img"]) > 0 else None  # src is protocol-relative
            # author avatar
            item["author_img"] = div.xpath(".//div[@class='author clearfix']//img/@src")
            item["author_img"] = "https:" + item["author_img"][0] if len(item["author_img"]) > 0 else None
            # "funny" vote count
            item["stats_vote"] = div.xpath(".//span[@class='stats-vote']/i/text()")
            item["stats_vote"] = item["stats_vote"][0] if len(item["stats_vote"]) > 0 else None
            content_list.append(item)
        return content_list

    def save_content_list(self, content_list):  # append records to a text file
        with open('糗事百科.txt', 'a', encoding='utf-8') as f:  # open the file once, not once per record
            for c in content_list:
                f.write(json.dumps(c, ensure_ascii=False, indent=2))
                f.write("\n")
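
    # NOTE: illustrative addition, not part of the original post. Because
    # json.dumps is called with indent=2 above, each record spans several
    # lines, so the output is not line-delimited JSON. A sketch of a JSON
    # Lines variant (the method name and the .jsonl file name are hypothetical):
    def save_content_list_jsonl(self, content_list):
        with open('糗事百科.jsonl', 'a', encoding='utf-8') as f:
            for c in content_list:
                f.write(json.dumps(c, ensure_ascii=False))  # one record per line
                f.write("\n")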

    def run(self):  # main flow
        # 1. build the list of page URLs
        url_list = self.get_url_list()
        # 2. iterate: send a request and get each response
        for url in url_list:
            html_str = self.parse_url(url)
            # 3. extract the data
            content_list = self.get_content_list(html_str)
            # 4. save
            self.save_content_list(content_list)

if __name__ == '__main__':
    qiubai = QiuSpider()
    qiubai.run()
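
Because save_content_list appends pretty-printed JSON objects one after another, the output file is a stream of concatenated JSON documents, not a single valid JSON array. Below is a minimal sketch (not from the original post) for reading the records back, assuming the 糗事百科.txt file written above; it relies on json.JSONDecoder.raw_decode, which parses one object starting at a given offset:

import json

def load_items(path):
    decoder = json.JSONDecoder()
    with open(path, encoding='utf-8') as f:
        text = f.read()
    items, idx = [], 0
    while idx < len(text):
        obj, end = decoder.raw_decode(text, idx)  # parse one JSON object starting at offset idx
        items.append(obj)
        idx = end
        while idx < len(text) and text[idx].isspace():  # skip the newline separators
            idx += 1
    return items

# usage example:
# print(len(load_items('糗事百科.txt')))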
