网页爬取
1、图片爬取代码
import os

import requests

# Destination directory and the image to download.
root = "H:/美图/"
url = "https://k.zol-img.com.cn/sjbbs/7692/a7691501_s.jpg"
# Save the image under its original file name (the last URL path segment).
path = os.path.join(root, url.split("/")[-1])

try:
    # makedirs(exist_ok=True) creates missing parent directories too and
    # avoids the check-then-create race of exists() followed by mkdir().
    os.makedirs(root, exist_ok=True)
    if not os.path.exists(path):
        # A timeout keeps the script from hanging forever on a stalled server.
        r = requests.get(url, timeout=30)
        # Turn HTTP 4xx/5xx responses into exceptions so no error page is
        # written to disk as if it were the image.
        r.raise_for_status()
        # The with-statement closes the file; no explicit close() is needed.
        with open(path, "wb") as f:
            f.write(r.content)
        print("文件保存成功")
    else:
        print("文件已经存在")
except (requests.RequestException, OSError):
    # Only network and file-system failures are reported as a failed crawl;
    # KeyboardInterrupt and programming errors are no longer swallowed.
    print("爬取失败")
2、打印大学排名
import bs4
import requests
from bs4 import BeautifulSoup


def getHTMLText(url):
    """Fetch a page and return its text.

    Args:
        url: Address of the page to download.

    Returns:
        The response body decoded with the encoding detected from its
        content, or an empty string if the request fails.
    """
    try:
        # A timeout keeps the call from blocking forever on a stalled server.
        r = requests.get(url, timeout=30)
        r.raise_for_status()
        # Use the encoding guessed from the body rather than the header one.
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        # Best effort: callers receive "" instead of an exception.
        return ""


def parserHTML(ulist, html):
    """Extract table rows from html and append them to ulist.

    For every row inside the first <tbody>, the text of cells 0, 1 and 3 is
    appended to ulist as a three-item list (printed later as rank, school
    and total score).

    Args:
        ulist: List that receives one [cell0, cell1, cell3] entry per row.
        html: HTML text of the page; may be empty.
    """
    soup = BeautifulSoup(html, "html.parser")
    tbody = soup.find("tbody")
    if tbody is None:
        # No table body, e.g. the download failed and html is "".
        return
    for tr in tbody.children:
        # .children also yields whitespace text nodes; keep real tags only.
        if not isinstance(tr, bs4.element.Tag):
            continue
        tds = tr.find_all("td")
        if len(tds) < 4:
            # Skip rows that do not have the columns read below.
            continue
        ulist.append([tds[0].string, tds[1].string, tds[3].string])


def printUnivList(ulist, num):
    """Print a header line followed by at most num rows of ulist.

    Args:
        ulist: Rows with three printable items each, as built by parserHTML.
        num: Maximum number of rows to print.
    """
    # chr(12288) is the full-width (CJK) space; it is used as the fill
    # character for the Chinese school-name column.
    pad = chr(12288)
    print("{0:^10}\t{1:{3}^8}\t{2:^8}".format("排名", "学校", "总分", pad))
    # Slicing tolerates lists shorter than num; max() keeps a negative num
    # meaning "print no rows", as range(num) did.
    for row in ulist[:max(num, 0)]:
        # A cell's .string is None when the <td> has several children;
        # format() cannot centre None, so print such a cell as empty.
        rank, name, score = ["" if row[i] is None else row[i] for i in range(3)]
        print("{0:^10}\t{1:{3}^10}\t{2:^10}".format(rank, name, score, pad))


def main():
    """Download the 2016 ranking page and print its first 10 rows."""
    url = "http://www.zuihaodaxue.com/zuihaodaxuepaiming2016.html"
    html = getHTMLText(url)
    ulist = []
    parserHTML(ulist, html)
    printUnivList(ulist, 10)


if __name__ == "__main__":
    main()
打印大学列表时，使用 format() 函数要注意填充字符是中文字符还是英文字符：对齐中文内容时，应使用 chr(12288)（中文全角空格）作为填充字符，否则各列无法对齐。