欢迎来到RankFan的Blogs

扩大
缩小

爬虫案例:中国大学排名(软科)

该页面的数据是动态加载的:直接请求网页的 HTML 只能爬取到前 30 所大学;如果想要获得全部数据,必须找到页面背后对应的 JSON 接口(见下方代码注释中的 API 地址)。

# ref: https://blog.csdn.net/qq_42103091/article/details/118002291
# JSON API that appears to serve the full ranking data: https://www.shanghairanking.cn/api/pub/v1/bcur?bcur_type=11&year=2021
import requests
from bs4 import BeautifulSoup
import re
import bs4

# ref: https://blog.csdn.net/weixin_44578172/article/details/109340255

# Browser-like User-Agent sent with each request. The value is assembled from
# adjacent string literals: a backslash continuation *inside* the quotes would
# embed the next line's indentation in the string.
para = {
    'user-agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/92.0.4515.159 Safari/537.36'
    ),
}

def getUrlText(url):
    """Download ``url`` and return the response body as text.

    The module-level ``para`` dict (a User-Agent entry) is sent as HTTP
    request headers.

    Args:
        url: Address of the page to fetch.

    Returns:
        The decoded response body, or ``None`` if the request failed
        (connection problem, timeout, or an HTTP error status). A failure
        message is printed in that case.
    """
    try:
        # ``para`` is a header dict, so it belongs in ``headers``; passing it
        # as ``params`` would put the User-Agent into the URL query string.
        response = requests.get(url, headers=para, timeout=30)
        response.raise_for_status()
    except requests.RequestException:
        # Only request-level failures are handled here; unrelated errors
        # (and KeyboardInterrupt) are no longer swallowed.
        print('爬取失败')
        return None
    # Decode with the encoding detected from the body itself.
    response.encoding = response.apparent_encoding
    return response.text

def fillUnivList(html, ulist):
    """Parse the ranking table in ``html`` and append one row per table row.

    Each appended row is a list of five stripped strings: the text of cell 0,
    the text of the first link in the row, and the text of cells 2, 3 and 4.
    (The printing code labels these as rank, school, province/city, type and
    overall score.)

    Args:
        html: HTML source of the ranking page. ``None`` or an empty string
            (e.g. after a failed download) leaves ``ulist`` untouched.
        ulist: List that receives the parsed rows; it is modified in place.

    Returns:
        The same ``ulist`` object that was passed in.
    """
    if not html:
        return ulist
    soup = BeautifulSoup(html, 'html.parser')
    tbody = soup.find('tbody')
    if tbody is None:
        # No table body in the document: nothing to extract.
        return ulist
    for tr in tbody.children:
        # Skip children that are not tags (e.g. text nodes between rows).
        if not isinstance(tr, bs4.element.Tag):
            continue
        tds = tr('td')
        links = tr('a')
        # Skip rows that do not have the expected five cells and a link
        # instead of failing with IndexError.
        if len(tds) < 5 or not links:
            continue
        cells = (tds[0], links[0], tds[2], tds[3], tds[4])
        # get_text() is used rather than .string because .string is None
        # whenever a tag has more than one child.
        ulist.append([cell.get_text().strip() for cell in cells])
    return ulist

def printUnivList(ulist, num):
    """Print a header line followed by at most ``num`` rows of ``ulist``.

    Args:
        ulist: Sequence of rows; items 0-4 of each row are printed in the
            rank, school, province/city, type and overall-score columns.
        num: Maximum number of rows to print. If ``ulist`` holds fewer rows,
            all of them are printed; zero or a negative value prints only
            the header.
    """
    string_format = "{0:^10}\t{1:{5}^10}\t{2:{5}^10}\t{3:^10}\t{4:^10}\t{5:^10}"
    # chr(12288) is the full-width (ideographic) space; it is used as the
    # fill character for the columns that reference {5}.
    fill = chr(12288)
    print(string_format.format("排名", "学校", "省市", "类型", "综合得分", fill))
    # Slice instead of indexing range(num) so a short list cannot raise
    # IndexError; max() keeps a negative num from slicing from the end.
    for uni in ulist[:max(num, 0)]:
        print(string_format.format(uni[0], uni[1], uni[2], uni[3], uni[4], fill))

def main():
    """Download the 2021 ranking page and print up to 20 parsed rows."""
    url = 'https://www.shanghairanking.cn/rankings/bcur/2021'
    html = getUrlText(url)
    if not html:
        # The download failed (getUrlText already printed a message) or the
        # body was empty; there is nothing to parse.
        return
    uinfo = fillUnivList(html, [])
    # Never request more rows than were actually parsed.
    printUnivList(uinfo, num=min(20, len(uinfo)))

# Run the scraper only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()

posted on 2021-09-04 14:31  RankFan  阅读(268)  评论(0)  编辑  收藏  举报

导航