【Python】Scraping Images from a Website
```python
import os

import bs4
import requests
import urllib.request

# Browser-like headers so the image host does not reject the requests.
hdr = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
       'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
       'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
       'Accept-Encoding': 'none',
       'Accept-Language': 'en-US,en;q=0.8',
       'Connection': 'keep-alive'}

DownPath = "C:/Users/Administrator/PycharmProjects/untitled/"
c = '.jpg'

for x in range(5, 100):
    # One sub-directory per gallery page.
    newDownPath = DownPath + str(x) + "/"
    os.makedirs(newDownPath, exist_ok=True)

    # Fetch the gallery page to a temporary local file and parse it.
    site = "http://www.meizitu.com/a/" + str(x) + ".html"
    local_filename, headers = urllib.request.urlretrieve(site)
    with open(local_filename) as html:
        soup = bs4.BeautifulSoup(html, "html5lib")

    PhotoNum = 0
    for photo in soup.find_all('img'):
        src = photo.get('src')
        if not src or not src.startswith('http'):
            continue  # skip <img> tags without an absolute image URL
        print(src)
        PhotoNum += 1
        Name = str(PhotoNum) + c
        # Stream the image to disk in chunks.
        r = requests.get(src, headers=hdr, stream=True)
        with open(newDownPath + Name, 'wb') as fd:
            for chunk in r.iter_content(chunk_size=8192):
                fd.write(chunk)
```
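The script above only downloads `<img>` tags whose `src` is already an absolute URL. If a page uses relative image paths, they can be resolved against the page address with `urllib.parse.urljoin`. A minimal sketch, assuming that situation; the helper name `absolute_src` is my own and not part of the original script:

```python
from urllib.parse import urljoin

def absolute_src(page_url, src):
    # Resolve a possibly relative <img src> against the page it came from.
    # e.g. absolute_src("http://www.meizitu.com/a/5.html", "/img/1.jpg")
    #      -> "http://www.meizitu.com/img/1.jpg"
    return urljoin(page_url, src)
```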
2. A second crawler, this time downloading wallpapers from www.netbian.com:
```python
import gzip
from io import BytesIO

import bs4
import requests
import urllib.request

# Browser-like headers; request only gzip so the response can be decompressed
# with the gzip module below (the original also requested "br", which
# gzip.GzipFile cannot decode).
hdr = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36',
       'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
       'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
       'Accept-Encoding': 'gzip',
       'Accept-Language': 'zh-CN,zh;q=0.9',
       'Connection': 'keep-alive'}

for i in range(1, 24000):
    site = "http://www.netbian.com/desk/" + str(i + 1) + ".htm"
    req = urllib.request.Request(site, headers=hdr)
    try:
        response = urllib.request.urlopen(req)
    except Exception:
        continue  # skip pages that 404 or time out

    # The server sends gzip-compressed HTML; decompress it before parsing.
    page = response.read()
    if response.info().get('Content-Encoding') == 'gzip':
        page = gzip.GzipFile(fileobj=BytesIO(page)).read()
    soup = bs4.BeautifulSoup(page, "html.parser")

    # The wallpaper preview is the <img> tag that carries a title attribute.
    for nam in soup.find_all('img'):
        if not nam.has_attr('title'):
            continue
        src = nam.get('src', '')
        print(src)
        if src.startswith('http') and src.endswith('.jpg'):
            rr = requests.get(src, headers=hdr, stream=True)
            with open('L:/img/' + str(i + 1) + '.jpg', 'wb') as fd:
                for chunk in rr.iter_content(chunk_size=8192):
                    fd.write(chunk)
```
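Since `requests` transparently decompresses gzip and deflate responses, the same crawl loop can also be written without the manual `gzip.GzipFile` step. A minimal sketch under that assumption, reusing the netbian page structure from the script above (the small range and the current-directory output path are placeholders for illustration only):

```python
import bs4
import requests

# requests handles gzip/deflate decoding itself, so no BytesIO/gzip plumbing.
session = requests.Session()
session.headers.update({'User-Agent': 'Mozilla/5.0'})

for i in range(1, 20):  # small range just to illustrate the loop
    page_url = "http://www.netbian.com/desk/" + str(i + 1) + ".htm"
    try:
        resp = session.get(page_url, timeout=10)
        resp.raise_for_status()
    except requests.RequestException:
        continue

    soup = bs4.BeautifulSoup(resp.text, "html.parser")
    for img in soup.find_all('img', title=True):  # only the titled preview image
        src = img.get('src', '')
        if src.startswith('http') and src.endswith('.jpg'):
            data = session.get(src, timeout=10).content
            with open(str(i + 1) + '.jpg', 'wb') as fd:  # saved in the working dir
                fd.write(data)
```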