周报11

大学排名的爬取与展示
# -*- coding:utf-8 -*-
import bs4
import requests
import chardet
from bs4 import BeautifulSoup
import pandas as pd
from matplotlib import pyplot as plt

headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36'
}


def get_from_url(year, timeout=10):
    """Download the ranking page of every requested year and merge the rows.

    Args:
        year: Iterable of years; each value is appended to the base ranking
            URL to form the address of that year's page.
        timeout: Seconds to wait for each HTTP request before giving up.

    Returns:
        A pandas DataFrame built from the row dictionaries returned by
        ``getUniversList`` for every page, in request order. An empty
        ``year`` iterable yields an empty DataFrame without any request.

    Raises:
        requests.RequestException: If a request fails, times out, or the
            server answers with an HTTP error status.
    """
    url = "https://www.shanghairanking.cn/rankings/bcur/"
    itemlist = []
    for i in year:
        label = str(i)
        res = requests.get(url + label, headers=headers, timeout=timeout)
        # Stop with a clear HTTP error rather than failing later while
        # parsing an error page that has no ranking table.
        res.raise_for_status()
        # Pick the charset from the raw bytes so res.text decodes with it.
        res.encoding = chardet.detect(res.content)['encoding']
        itemlist.extend(getUniversList(res.text, label))
    total = pd.DataFrame(itemlist)
    print(total)
    return total


def write_html_to_csv(s):
    """Save the table ``s`` as ``html.csv`` in the working directory.

    The row index is left out of the file, so only the columns of ``s``
    are written.
    """
    output_path = 'html.csv'
    s.to_csv(output_path, index=False)


def read_html_from_csv():
    """Load ``html.csv`` from the working directory and return it as a DataFrame."""
    return pd.read_csv("html.csv")


def _cell_text(td):
    """Return the text of one table cell with spaces and newlines removed."""
    text = td.string
    if not text:
        # A cell with nested markup has no single string of its own; use the
        # first link inside it, or the whole cell when there is no link.
        link = td.find('a')
        source = link if link is not None else td
        text = source.string or source.get_text()
    return str(text).replace(' ', '').replace('\n', '')


def getUniversList(html,year):
    """Extract the ranking rows from the first ``<tbody>`` of an HTML page.

    Args:
        html: Page markup as text.
        year: Value stored under the "年份" key of every returned row.

    Returns:
        A list with one dict per table row, keyed by "年份", "排名", "名称",
        "省市", "类型", "总分" and "办学层次"; the last six values are taken
        from the first six cells of the row, in that order. Returns an empty
        list when the page has no ``<tbody>``. Rows with fewer than six cells
        are skipped.
    """
    # Empty comment markers are removed before parsing; a cell that holds
    # text plus a comment node would otherwise have no single `.string`.
    soup = BeautifulSoup(html.replace('<!---->', ''), "lxml")
    tbody = soup.find('tbody')
    if tbody is None:
        return []

    list_u = []
    # Column order: rank, name, province/city, type, total score, level.
    # Only element children are visited, so whitespace text nodes between
    # rows or cells can neither raise nor shift the column positions.
    for tr in tbody.find_all('tr', recursive=False):
        cells = [_cell_text(td) for td in tr.find_all(['td', 'th'], recursive=False)]
        if len(cells) < 6:
            continue
        list_u.append({"年份":year, "排名":cells[0], "名称":cells[1], "省市":cells[2], "类型":cells[3], "总分":cells[4],"办学层次":cells[5]})
    return list_u


if __name__ == '__main__':
    year = [2015, 2016, 2017, 2018, 2019, 2020, 2021]
    # Scrape every listed year and cache the merged table on disk.
    write_html_to_csv(get_from_url(year))

    # Work on the copy reloaded from the CSV file from here on.
    rankings = read_html_from_csv()
    print(rankings)

    # Chinese-capable fonts in order of preference: KaiTi first, then the
    # others as fallbacks when it is not installed.
    plt.rcParams['font.sans-serif'] = ['KaiTi', 'SimHei', 'FangSong']
    plt.rcParams['font.size'] = 5
    plt.rcParams['axes.unicode_minus'] = False  # display minus signs properly

    # One stacked subplot per year, each showing the first ten rows of that year.
    # NOTE(review): 2021 is scraped above but is not plotted - confirm intended.
    plot_years = [2015, 2016, 2017, 2018, 2019, 2020]
    for position, plot_year in enumerate(plot_years, start=1):
        first_rows = rankings[rankings["年份"] == plot_year].head(10)
        plt.subplot(len(plot_years), 1, position)
        plt.title(str(plot_year))
        plt.plot(first_rows["名称"], first_rows["排名"], label='排名')

    plt.show()

    # Interactive lookup: "1" quits, any other input starts a query.
    while True:
        print("1.退出")
        print("2.查询")
        s = input()
        if s == "1":
            break
        print("请输入年份")
        query_year = input()
        print("请输入学校")
        school = input()
        # Compare the year as text and match the school as a plain substring.
        # Missing names are treated as empty text so they cannot raise.
        year_matches = rankings["年份"].astype(str) == query_year
        name_matches = rankings["名称"].fillna('').astype(str).str.contains(school, regex=False)
        print(rankings[year_matches & name_matches])
posted @ 2022-05-15 08:59 我的未来姓栗山阅读(20) 评论(0) 编辑收藏举报
会员力量，点亮园子希望
刷新页面返回顶部
我的未来姓栗山

周报11

公告