python 爬虫(五)
下载媒体文件
一、使用 urllib.request.urlretrieve 方法可以下载文件并保存为指定的文件名
from urllib.request import urlretrieve
from urllib.request import urlopen
from bs4 import BeautifulSoup

# Fetch the site's front page, locate the logo image inside the
# anchor with id "logo", and save that image locally as "logo.jpg".
with urlopen("http://www.pythonscraping.com") as page:
    soup = BeautifulSoup(page, 'html.parser')

logo_anchor = soup.find('a', {'id': "logo"})
logo_src = logo_anchor.find("img")["src"]

# urlretrieve downloads the resource at the URL to the given filename.
urlretrieve(logo_src, "logo.jpg")
import os
from urllib.request import urlretrieve
from urllib.request import urlopen

downloadDirectory = "downloaded"
baseUrl = "http://pythonscraping.com"


def getAbsoluteURL(baseUrl, source):
    """Normalize *source* to an absolute URL under *baseUrl*.

    Returns the absolute URL, or None when the resource lives on a
    different site (so the caller can skip off-site downloads).
    """
    if source.startswith("http://www."):
        # Drop the "www." so the result matches baseUrl's host form.
        url = "http://" + source[11:]
    elif source.startswith("http://"):
        url = source
    elif source.startswith("www."):
        # BUG FIX: the original computed source[4:] and then immediately
        # overwrote it with "http://" + source, leaving the "www." prefix
        # in place and making the baseUrl containment check below fail.
        url = "http://" + source[4:]
    else:
        # Relative path: resolve against the site root.
        url = baseUrl + "/" + source
    if baseUrl not in url:
        return None
    return url


def getDownLoadPath(baseUrl, absoluteUrl, downloadDirectory):
    """Map *absoluteUrl* to a local file path under *downloadDirectory*.

    Creates any missing intermediate directories and returns the path.
    BUG FIX: the parameter was misspelled "downloadDirecory" and the body
    silently used the module-level global instead of its own argument.
    """
    path = absoluteUrl.replace("www.", "")
    path = path.replace(baseUrl, "")
    path = downloadDirectory + path
    path = path.split("?")[0]  # strip any query string from the filename
    directory = os.path.dirname(path)
    if not os.path.exists(directory):
        os.makedirs(directory)
    return path


if __name__ == "__main__":
    # bs4 is imported lazily so the helper functions above remain
    # importable (and unit-testable) without BeautifulSoup installed.
    from bs4 import BeautifulSoup

    html = urlopen("http://www.pythonscraping.com")
    bsObj = BeautifulSoup(html, "html.parser")
    # Every tag carrying a src attribute (images, scripts, ...).
    downloadList = bsObj.find_all(src=True)
    for download in downloadList:
        fileUrl = getAbsoluteURL(baseUrl, download["src"])
        if fileUrl is not None:
            print(fileUrl)
            # Renamed from "dir", which shadowed the builtin.
            savePath = getDownLoadPath(baseUrl, fileUrl, downloadDirectory)
            print("save: " + savePath)
            urlretrieve(fileUrl, savePath)