爬虫 正则 之 爬取糗事百科的照片

直接上代码:

import re
import requests
# 生成固定长度的数字字符串,稍后用作图片文件名
def creat_fivenum(num, n=5):
    """Return ``num`` as a decimal string left-padded with zeros to ``n`` characters.

    The result is used as a fixed-width file name so that saved files sort
    in download order.

    :param num: integer to format.
    :param n: minimum width of the result (default 5). Values that already
        need ``n`` or more characters are returned unpadded.
    :return: the zero-padded string, e.g. ``creat_fivenum(7) == "00007"``.
    """
    # str.zfill honours ``n`` (the previous hand-written loop ignored it) and
    # always returns a string, even for numbers of ten or more digits.
    return str(num).zfill(n)

# 爬取糗事百科
# https://www.qiushibaike.com/imgrank/
# 分析一下照片所在的区域
# <div class="thumb">
# <a href="/article/124066439" target="_blank">
# <img src="//pic.qiushibaike.com/system/pictures/12406/124066439/medium/NKSSOW6NS7WM1L6J.jpg" alt="糗事#124066439" class="illustration" width="100%" height="auto">
# </a>
# </div>
def downQiushiImg(endpage=13):
    """Download every image found on the Qiushibaike image-rank listing pages.

    Pages ``1 .. endpage`` of ``https://www.qiushibaike.com/imgrank/page/<n>/``
    are fetched, each ``<div class="thumb">`` image URL is extracted with a
    regular expression, and the image bytes are written to
    ``D://糗事百科image/<counter>.png`` using a zero-padded running counter.

    :param endpage: number of listing pages to crawl, starting from page 1.
    :return: None. Progress and the final count are printed.
    """
    import os  # local import so this function does not depend on the file header

    # The request headers were referenced but never defined in this file,
    # which raised NameError on the first request; send a browser-like
    # User-Agent instead.
    headers = {
        "User-Agent": (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/88.0.4324.150 Safari/537.36"
        )
    }

    out_dir = "D://糗事百科image"
    # open() does not create missing directories, so make sure the target exists.
    os.makedirs(out_dir, exist_ok=True)

    # re.S lets '.' match newlines; the <div> ... </div> markup spans several lines.
    img_pattern = re.compile(
        r'<div class="thumb">.*?<img src="(.*?)" alt.*?</div>', re.S
    )

    saved = 0  # number of images written so far
    for page in range(1, endpage + 1):
        page_url = f"https://www.qiushibaike.com/imgrank/page/{page}/"
        # A timeout keeps a stalled connection from hanging the crawl forever.
        page_resp = requests.get(page_url, headers=headers, timeout=10)
        for src in img_pattern.findall(page_resp.text):
            # The page uses protocol-relative URLs ("//pic..."), so add a scheme.
            img_url = "http:" + src
            print(img_url)
            img_resp = requests.get(img_url, headers=headers, timeout=10)
            if not img_resp.ok:
                # Do not store an HTTP error page as if it were an image.
                print(f"跳过(状态码{img_resp.status_code}): {img_url}")
                continue
            with open(f"{out_dir}/{creat_fivenum(saved + 1)}.png", "wb") as f:
                f.write(img_resp.content)
            saved += 1
    print(f"爬取完毕,一共{saved}个照片")
# Start the crawl with the default page count; this runs whenever the file is
# executed or imported, because the call sits at module level.
downQiushiImg()
View Code

注意:再次申明,正则匹配不到结果,往往是因为没有指定 re.S(指定后 "." 才能匹配换行符,从而跨行匹配 HTML)

posted @ 2021-02-16 23:03    阅读(59)  评论(0)  编辑  收藏  举报