# 周报11
#
# 大学排名的爬取与展示
# -*- coding:utf-8 -*-
import bs4
import requests
import chardet
from bs4 import BeautifulSoup
import pandas as pd
from matplotlib import pyplot as plt

# Browser-like User-Agent string sent with every HTTP request made by this script.
_USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36'

# Request headers passed to requests.get().
headers = {'User-Agent': _USER_AGENT}


def get_from_url(year):
    """Download the ranking page of each requested year and collect its rows.

    Args:
        year: Iterable of years. Each one is converted with ``str`` and
            appended to the base ranking URL to form that year's address.

    Returns:
        pandas.DataFrame built from the row dicts returned by
        ``getUniversList`` for every page, in request order. The frame is
        empty when ``year`` is empty.

    Raises:
        requests.RequestException: If a page cannot be fetched, the request
            times out, or the server answers with an error status.
    """
    url = "https://www.shanghairanking.cn/rankings/bcur/"
    itemlist = []
    for i in year:
        label = str(i)
        # A timeout keeps one unresponsive connection from hanging the run.
        res = requests.get(url + label, headers=headers, timeout=30)
        # Stop on HTTP errors instead of handing an error page to the parser.
        res.raise_for_status()
        print(res.encoding)
        # Decode res.text with the charset detected from the body bytes.
        res.encoding = chardet.detect(res.content)['encoding']
        itemlist.extend(getUniversList(res.text, label))
    total = pd.DataFrame(itemlist)
    print(total)
    return total


def write_html_to_csv(s, path='html.csv'):
    """Save the ranking table as a CSV file without the index column.

    Args:
        s: pandas.DataFrame to save (any object with a compatible
            ``to_csv`` method works).
        path: Destination file. Defaults to ``html.csv`` in the current
            working directory.
    """
    s.to_csv(path, index=False)


def read_html_from_csv(path='html.csv'):
    """Load a previously saved ranking table from a CSV file.

    Args:
        path: File to read. Defaults to ``html.csv`` in the current
            working directory.

    Returns:
        pandas.DataFrame with the file's contents.
    """
    return pd.read_csv(path)


def _cell_text(cell):
    """Return the text of one table cell with spaces and newlines removed."""
    text = cell.string
    if not text:
        # A cell with several children has no single string (presumably the
        # university-name cell): prefer its first link, else the whole cell.
        link = cell.find('a')
        holder = cell if link is None else link
        text = holder.string or holder.get_text()
    return str(text).replace(' ', '').replace('\n', '')


def getUniversList(html, year):
    """Extract the ranking rows from one page of HTML.

    Args:
        html: Page markup as text. ``<!---->`` placeholder comments are
            removed before parsing.
        year: Value stored under the "年份" (year) key of every row.

    Returns:
        List of dicts, one per table row with at least six cells. Keys, in
        order: 年份 (year), 排名 (rank), 名称 (name), 省市 (province),
        类型 (type), 总分 (score), 办学层次 (level). The list is empty when
        the page contains no ``<tbody>``.
    """
    fields = ("排名", "名称", "省市", "类型", "总分", "办学层次")
    soup = BeautifulSoup(html.replace('<!---->', ''), "lxml")
    body = soup.find('tbody')
    if body is None:
        return []
    list_u = []
    # find_all(True, recursive=False) yields only direct child tags, so
    # whitespace text nodes between rows or cells are never mistaken for data.
    for tr in body.find_all(True, recursive=False):
        cells = tr.find_all(True, recursive=False)
        if len(cells) < len(fields):
            # Not a complete data row; skip it rather than fail on indexing.
            continue
        row = {"年份": year}
        for key, cell in zip(fields, cells):
            row[key] = _cell_text(cell)
        list_u.append(row)
    return list_u


if __name__ == '__main__':
    # Scrape every year, cache the table as CSV, then work from the cached copy.
    years = [2015, 2016, 2017, 2018, 2019, 2020, 2021]
    write_html_to_csv(get_from_url(years))

    a = read_html_from_csv()
    print(a)

    # Fonts able to render Chinese labels, in priority order.
    plt.rcParams['font.sans-serif'] = ['KaiTi', 'SimHei', 'FangSong']
    plt.rcParams['font.size'] = 5
    # Render the minus sign correctly with the fonts above.
    plt.rcParams['axes.unicode_minus'] = False

    # One stacked subplot per year, each showing the first ten rows of that year.
    plot_years = [2015, 2016, 2017, 2018, 2019, 2020]
    for position, plot_year in enumerate(plot_years, start=1):
        top = a[a["年份"] == plot_year].head(10)
        plt.subplot(len(plot_years), 1, position)
        plt.title(str(plot_year))
        plt.plot(top["名称"], top["排名"], label='排名')
    plt.show()

    # Interactive lookup: "1" quits, any other input starts a query.
    while True:
        print("1.退出")
        print("2.查询")
        s = input()
        if s == "1":
            break
        print("请输入年份")
        wanted_year = input()
        print("请输入学校")
        school = input()
        # Exact match on the year text, plain substring match on the name.
        # astype(str) keeps a missing name from raising during the search.
        year_match = a["年份"].astype(str) == wanted_year
        name_match = a["名称"].astype(str).str.contains(school, regex=False)
        print(a[year_match & name_match])
# posted @ 2022-05-15 08:59  我的未来姓栗山  阅读(20)  评论(0)  编辑  收藏  举报