# 爬_01 300页逗图 (Crawler 01: download 300 pages of meme images from doutula.com)

import os
import re

import requests

# Base URL of the listing pages; the page number is appended to it.
baseUrl = "https://www.doutula.com/article/list/?page="
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.66 Safari/537.36',
    'Cookie': 'Hm_lvt_2fc12699c699441729d4b335ce117f40=1607422895; _agep=1607422895; _agfp=71f4f425ea56fba0af9404dcbd130fd1; _agtk=4a5f8ee4d308915b44de47a4c1e0916a; Hm_lpvt_2fc12699c699441729d4b335ce117f40=1607423412',
}

# Images are stored in the "doutu" folder under the current working
# directory. Create it up front so open() cannot fail on a fresh run.
output_dir = './doutu'
os.makedirs(output_dir, exist_ok=True)

# Compiled once rather than once per page; captures the value of every
# data-original="..." attribute found in the page HTML.
image_url_pattern = re.compile(r'data-original="(.*?)"')

# Without a timeout a single stalled connection would hang the whole crawl.
request_timeout = 10

# Fetch listing pages 1..300 (the original range(301) also requested page 0,
# i.e. 301 pages instead of the intended 300).
for i in range(1, 301):
    url = baseUrl + str(i)
    try:
        page_response = requests.get(url, headers=headers, timeout=request_timeout)
        page_response.raise_for_status()
    except requests.RequestException as exc:
        # Best effort: one bad page should not abort the remaining pages.
        print(f'failed to fetch page {url}: {exc}')
        continue

    # .text gives the decoded HTML as a string.
    html_str = page_response.text
    image_urls = image_url_pattern.findall(html_str)
    print(image_urls)

    for image_url in image_urls:
        print(image_url)
        # The file name is the last path segment of the image URL.
        image_names = image_url.split('/')[-1]
        if not image_names:
            # URL ends with "/" (or is empty): there is no usable file name.
            continue
        try:
            image_response = requests.get(image_url, headers=headers, timeout=request_timeout)
            # Avoid saving an HTML error page under an image file name.
            image_response.raise_for_status()
        except requests.RequestException as exc:
            print(f'failed to fetch image {image_url}: {exc}')
            continue

        # .content gives the raw bytes, which is what a binary file needs.
        with open(f'{output_dir}/{image_names}', 'wb') as file:
            file.write(image_response.content)
# posted @ 2020-12-09 09:10  wshf  阅读(59)  评论(0)  编辑  收藏  举报