# 爬取糗事百科列表页案例 (Example: scraping the Qiushibaike list pages)

import requests
from lxml import etree
import re


# 爬取糗事百科所有列表页信息
class Qiushi():
    """Crawl the Qiushibaike "8hr" list pages and print every post found.

    Instantiating the class runs the whole crawl: the constructor reads the
    page count from ``base_url`` and then fetches and prints each list page.

    Args:
        base_url: Root URL of the site, e.g. ``'https://www.qiushibaike.com/'``.
            It is used both to discover the page count and to build the URL
            of every list page.
    """

    # Seconds to wait for a response; without a timeout ``requests.get`` can
    # block forever on a stalled connection.
    REQUEST_TIMEOUT = 10

    def __init__(self, base_url):
        self.base_url = base_url
        self.max_page = self.get_max_page()
        self.get_data()

    def _fetch_html(self, url):
        """Download ``url`` and return it parsed as an lxml HTML tree."""
        response = requests.get(url, timeout=self.REQUEST_TIMEOUT)
        return etree.HTML(response.text)

    def _page_url(self, page):
        """Return the URL of list page number ``page`` under ``self.base_url``."""
        return '{}/8hr/page/{}/'.format(self.base_url.rstrip('/'), page)

    @staticmethod
    def _normalize_pic_url(src):
        """Turn an ``img/@src`` value into an absolute, full-size image URL.

        A scheme is prepended when the value has none (the original code
        assumes protocol-relative ``//host/path`` values), and any query
        string is dropped because it only carries thumbnail sizing options.
        """
        if not src.startswith(('http://', 'https://')):
            src = 'https:' + src
        # Everything before the first "?" is the original-size image.
        return src.partition('?')[0]

    @staticmethod
    def _parse_item(site):
        """Extract one post from a list ``<li>`` element.

        Returns:
            A dict with the keys ``funny_number``, ``comment_number``,
            ``content``, ``pic`` and ``username``, or ``None`` when any of
            those fields is missing (which is how advertisements are
            filtered out).
        """
        funny_number = site.xpath('.//div[@class="recmd-num"]/span[1]/text()')
        comment_number = site.xpath('.//div[@class="recmd-num"]/span[4]/text()')
        content = site.xpath('.//a[@class="recmd-content"]/text()')
        pic = site.xpath('.//a[contains(@class, "recmd-left")]/img/@src')
        username = site.xpath('.//span[@class="recmd-name"]/text()')

        # all() is true only when every field was found; ads lack some.
        if not all([funny_number, comment_number, content, pic, username]):
            return None

        return {
            'funny_number': funny_number[0],
            'comment_number': comment_number[0],
            'content': content[0],
            'pic': Qiushi._normalize_pic_url(pic[0]),
            'username': username[0],
        }

    def get_max_page(self):
        """Return the number of list pages advertised by the pagination bar.

        The count is read from the second-to-last pagination entry on
        ``self.base_url``. When the page has no pagination bar at all, a
        single page is assumed instead of failing with ``IndexError``.

        Raises:
            ValueError: If the pagination entry is present but not numeric.
        """
        html = self._fetch_html(self.base_url)
        labels = html.xpath('//ul[@class="pagination"]/li[last()-1]/a/span/text()')
        if not labels:
            return 1
        return int(labels[0].strip())

    def get_data(self):
        """Fetch every list page in turn and print the posts it contains."""
        for page in range(1, self.max_page + 1):
            html = self._fetch_html(self._page_url(page))
            items = html.xpath('//div[@class="recommend-article"]/ul/li')
            parsed = (self._parse_item(site) for site in items)
            all_list = [info for info in parsed if info is not None]

            # Print a page banner followed by one dict per post.
            print('-------------------第{}页------------------------'.format(page))
            for i in all_list:
                print(i)


if __name__ == "__main__":
    # Script entry point: constructing Qiushi starts the crawl from the
    # site's home page.
    start_url = 'https://www.qiushibaike.com/'
    Qiushi(start_url)

 

# posted @ 2018-12-15 01:24  Bob__Zhang  阅读(307)  评论(0)  编辑  收藏  举报