Python 中国大学排名定向爬虫
代码来自于中国大学MOOC北京理工大学Python教学团队:https://www.icourse163.org/learn/BIT-1001870001#/learn/content?type=detail&id=1211970249&cid=1215042961
1.函数版
# Targeted crawler for the Chinese university ranking (function version).
import bs4
import requests
from bs4 import BeautifulSoup


def getHTMLText(url):
    """Fetch *url* and return the response body as text.

    Returns an empty string when the request fails (connection error,
    timeout, invalid URL, or a non-2xx status), so callers never see a
    requests exception.
    """
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()
        # Decode with the encoding guessed from the body rather than the
        # one declared in the HTTP headers.
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        return ""


def fillUnivList(ulist, html):
    """Append one ``[rank, name, score]`` entry to *ulist* per table row.

    The values are the ``.string`` of the 1st, 2nd and 4th ``<td>`` of each
    ``<tr>`` inside the first ``<tbody>`` of *html*. If *html* contains no
    ``<tbody>`` (e.g. the download failed and *html* is empty), *ulist* is
    left untouched. Rows with fewer than four cells are skipped.
    """
    soup = BeautifulSoup(html, "html.parser")
    tbody = soup.find('tbody')
    if tbody is None:
        return
    for tr in tbody.children:
        # .children also yields whitespace text nodes; keep real tags only.
        if not isinstance(tr, bs4.element.Tag):
            continue
        tds = tr('td')
        if len(tds) < 4:
            continue
        ulist.append([tds[0].string, tds[1].string, tds[3].string])


def _blank_if_none(value):
    """Return "" for None so that str.format can apply an alignment spec."""
    return "" if value is None else value


def printUnivList(ulist, num):
    """Print a header line followed by at most *num* entries of *ulist*.

    Fewer lines are printed when *ulist* holds fewer than *num* entries.
    """
    tplt = "{0:^10}\t{1:{3}^10}\t{2:^10}"
    # chr(12288) is the full-width (ideographic) space; using it as the fill
    # character keeps columns of Chinese text visually aligned.
    fill = chr(12288)
    print(tplt.format("排名", "学校名称", "总分", fill))
    for u in ulist[:max(num, 0)]:
        print(tplt.format(
            _blank_if_none(u[0]),
            _blank_if_none(u[1]),
            _blank_if_none(u[2]),
            fill,
        ))


def main():
    """Download the 2018 ranking page and print the first 20 universities."""
    uinfo = []
    # Alternate mirror: https://www.zuihaodaxue.cn/zuihaodaxuepaiming2018.html
    url = 'http://www.zuihaodaxue.com/zuihaodaxuepaiming2018.html'
    html = getHTMLText(url)
    fillUnivList(uinfo, html)
    printUnivList(uinfo, 20)  # 20 univs


if __name__ == "__main__":
    main()
2.修改无函数版用于学习
# Targeted crawler for the Chinese university ranking
# (function-free version, kept flat on purpose for study).
import bs4
import requests
from bs4 import BeautifulSoup

ulist = []
url = 'http://www.zuihaodaxue.com/zuihaodaxuepaiming2018.html'

try:
    r = requests.get(url, timeout=30)
    r.raise_for_status()
    # Decode with the encoding guessed from the body rather than the one
    # declared in the HTTP headers.
    r.encoding = r.apparent_encoding
except requests.RequestException:
    print("爬取失败")
    # Stop here: without a response there is nothing to parse below.
    raise SystemExit(1)

html = r.text
soup = BeautifulSoup(html, "html.parser")
tbody = soup.find('tbody')
if tbody is None:
    # The page was fetched but does not contain the expected table.
    print("爬取失败")
    raise SystemExit(1)

for tr in tbody.children:
    # .children also yields whitespace text nodes; keep real tags only.
    if isinstance(tr, bs4.element.Tag):
        tds = tr('td')
        if len(tds) >= 4:
            ulist.append([tds[0].string, tds[1].string, tds[3].string])

tplt = "{0:^10}\t{1:{3}^10}\t{2:^10}"
# chr(12288) is the full-width (ideographic) space; using it as the fill
# character keeps columns of Chinese text visually aligned.
fill = chr(12288)
print(tplt.format("排名", "学校名称", "总分", fill))

num = 20
# Print the top `num` entries (fewer if the table is shorter).
for u in ulist[:num]:
    # .string may be None for a cell with nested markup; format it as blank.
    rank, name, score = ("" if cell is None else cell for cell in u)
    print(tplt.format(rank, name, score, fill))

print("爬取完毕")
本文来自博客园,作者:xdd1997
转载请注明:https://www.cnblogs.com/xdd1997/p/11743826.html