My First Crawler: Scraping Wallpaper Images and Sorting Them into Folders Automatically
This is the first complete crawler I wrote. The goal I set before starting was to scrape every desktop wallpaper from a single site and store the images automatically sorted by category.
My approach was to start with the URL of a single image and check whether the image could actually be fetched. Once that worked, I traced the site back up in the other direction: first the category list, then the detail-page URLs of all the images in each album, and finally the download URL of each individual image.
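That first check takes only a few lines. The snippet below is not part of the final script; it is a small sketch that fetches one wallpaper URL (the sample URL left as a comment in the code further down) with the same User-Agent header and writes it to an arbitrary file name, just to confirm that the site serves the image to a plain requests call (assuming that URL is still live):

# Quick check: can a single wallpaper be downloaded directly?
import requests

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36',
}

# Sample image URL taken from the comment in the full script below.
url = 'http://pic1.win4000.com/wallpaper/2018-05-10/5af43304a2837.jpg'
resp = requests.get(url, headers=headers)
print(resp.status_code, len(resp.content))  # expect 200 and a non-trivial byte count

# test.jpg is an arbitrary file name used only for this check.
with open('test.jpg', 'wb') as f:
    f.write(resp.content)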
The full code is as follows:
#!/usr/bin/env python
# -*- coding:utf-8 -*-
# Author: ss
from bs4 import BeautifulSoup
import requests
import time
import os

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36',
}


def get_image(url, title1, title2, title3, abc):
    """Download one full-size image and save it under the category/album folder."""
    # url = 'http://pic1.win4000.com/wallpaper/2018-05-10/5af43304a2837.jpg'
    data = requests.get(url=url, headers=headers)
    time.sleep(1)  # be polite: pause between image downloads
    with open('.\\allbz\\' + title1 + '\\' + title2 + '\\' + title3 + str(abc) + '.jpg', 'wb+') as f:
        f.write(data.content)
    print('Downloaded {} images so far'.format(abc), url, title3)


def get_jpg_url(url, title1, title2):
    """Parse a wallpaper detail page, extract the full-size image URL and title, then download it."""
    # url = 'http://www.win4000.com/wallpaper_detail_145701.html'
    data = requests.get(url=url, headers=headers)
    soup = BeautifulSoup(data.text, 'lxml')
    time.sleep(2)  # be polite: pause between page requests
    url = soup.select('img.pic-large')[0].get('src')
    title3 = soup.select('img.pic-large')[0].get('title')
    global abc
    abc += 1
    get_image(url, title1, title2, title3, abc)


def get_one_url(url, title1, title2):
    """Walk the thumbnail strip (#scroll) of one album and fetch every detail page in it."""
    # url = 'http://www.win4000.com/wallpaper_detail_145701.html'
    data = requests.get(url=url, headers=headers)
    soup = BeautifulSoup(data.text, 'lxml')
    urls = soup.select('#scroll')
    for i in urls:
        data1 = i.select('li > a')
        for m in data1:
            url = m.get('href')
            get_jpg_url(url, title1, title2)


def get_list(url, title1):
    """Walk one category's album list, create a folder per album, and crawl each album."""
    # url = 'http://www.win4000.com/wallpaper_195_0_0_1.html'
    data = requests.get(url=url, headers=headers)
    soup = BeautifulSoup(data.text, 'lxml')
    urls = soup.select('div.Left_bar > div.list_cont.Left_list_cont > div > div > div > ul')
    for i in urls:
        data1 = i.select('li > a')
        for m in data1:
            url = m.get('href')
            title2 = m.select('img')[0].get('title')
            if os.path.exists('.\\allbz\\' + title1 + '\\' + title2):
                # album folder already exists: skip it on re-runs
                continue
            else:
                os.mkdir('.\\allbz\\' + title1 + '\\' + title2)
            print('Start downloading {}'.format(title2))
            get_one_url(url, title1, title2)


def get_fenlei():
    """Entry point: read the category navigation and crawl every category."""
    url = 'http://www.win4000.com/wallpaper.html'
    data = requests.get(url=url, headers=headers)
    soup = BeautifulSoup(data.text, 'lxml')
    urls = soup.select('div.cont2')
    for i in urls:
        datas = i.select('a')
        for m in datas:
            url = m.get('href')
            title1 = m.text
            if title1 == '全部':  # skip the aggregate "全部" ("All") link
                continue
            else:
                if os.path.exists('.\\allbz\\' + title1):
                    # category folder already exists: skip it on re-runs
                    continue
                else:
                    os.mkdir('.\\allbz\\' + title1)
                print('Start downloading {}'.format(title1))
                get_list(url, title1)


abc = 0  # global counter of downloaded images
if not os.path.exists('.\\allbz'):  # create the base folder before any category folders go inside it
    os.mkdir('.\\allbz')
get_fenlei()
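The four functions mirror the site's hierarchy: get_fenlei reads the category navigation on wallpaper.html, get_list walks one category's album list, get_one_url walks the thumbnail strip (#scroll) of a single album, and get_jpg_url together with get_image resolve and save each full-size image. Because of the os.path.exists checks, a re-run skips any category or album whose folder already exists, and the global counter abc numbers every saved file. On disk the result looks roughly like this (category, album, and image names are whatever the site's title attributes contain; the names below are only placeholders):

allbz\
    <category title>\
        <album title>\
            <image title><n>.jpg    (n comes from the global counter abc)
            ...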