"""爬虫案例:中国大学排名(软科)

这是一个动态的网站,只能爬取前30个;如果想要获得全部的数据,必须找到对应的 JSON 文件。
"""
# ref: https://blog.csdn.net/qq_42103091/article/details/118002291
# https://www.shanghairanking.cn/api/pub/v1/bcur?bcur_type=11&year=2021
import requests
from bs4 import BeautifulSoup
import re
import bs4
# ref: https://blog.csdn.net/weixin_44578172/article/details/109340255
# Request headers: a desktop-Chrome User-Agent so the site serves the real page.
para = {
    'user-agent': ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                   'AppleWebKit/537.36 (KHTML, like Gecko) '
                   'Chrome/92.0.4515.159 Safari/537.36'),
}
def getUrlText(url):
    """Fetch *url* and return its decoded HTML text, or None on failure.

    Bug fixed: the original passed the User-Agent dict as ``params=``
    (i.e. query-string parameters), so the custom UA header was never
    sent; it must go through ``headers=``.
    """
    try:
        response = requests.get(url, headers=para, timeout=30)
        response.raise_for_status()  # turn 4xx/5xx into an exception
        # Guess encoding from the body; the page is Chinese text.
        response.encoding = response.apparent_encoding
        return response.text
    except requests.RequestException:
        # Narrowed from a bare ``except:`` so programming errors still surface.
        print('爬取失败')
        return None
def fillUnivList(html, ulist):
    """Parse the ranking <tbody> in *html* and append rows to *ulist*.

    Each appended row is [rank, university name, province, category, score].
    Returns *ulist* (mutated in place).

    Fixes: the original mixed ``.string`` (which is None when a tag has
    nested children, crashing ``.strip()``) with ``.text``; it also indexed
    ``tds``/``a`` without checking length. Use ``get_text(strip=True)``
    uniformly and skip malformed rows instead of raising.
    """
    soup = BeautifulSoup(html, 'html.parser')
    for tr in soup.find('tbody').children:
        if not isinstance(tr, bs4.element.Tag):
            continue  # skip NavigableString whitespace between rows
        tds = tr('td')
        links = tr('a')
        if len(tds) < 5 or not links:
            continue  # malformed row: ignore rather than IndexError
        ulist.append([tds[0].get_text(strip=True),
                      links[0].get_text(strip=True),
                      tds[2].get_text(strip=True),
                      tds[3].get_text(strip=True),
                      tds[4].get_text(strip=True)])
    return ulist
def printUnivList(ulist, num):
    """Print a header row plus the first *num* rows of *ulist* as a table.

    chr(12288) is the full-width (ideographic) space; using it as the fill
    character keeps the CJK columns visually aligned.
    """
    fmt = "{0:^10}\t{1:{5}^10}\t{2:{5}^10}\t{3:^10}\t{4:^10}\t{5:^10}"
    pad = chr(12288)
    print(fmt.format("排名", "学校", "省市", "类型", "综合得分", pad))
    for idx in range(num):
        rank, school, province, category, score = ulist[idx]
        print(fmt.format(rank, school, province, category, score, pad))
def main():
    """Entry point: download the 2021 ranking page and print the top 20."""
    url = 'https://www.shanghairanking.cn/rankings/bcur/2021'
    universities = fillUnivList(getUrlText(url), [])
    printUnivList(universities, num=20)


if __name__ == '__main__':
    main()