大学排名爬虫

# 利用 requests 库爬取网页内容
import requests
from bs4 import BeautifulSoup
import bs4


def get_text(url, timeout=10):
    """Download *url* and return the response body as text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` may block indefinitely.

    Returns:
        The decoded page text, or ``" "`` (a single space) when the request
        fails or the server answers with an error status.
    """
    try:
        r = requests.get(url, timeout=timeout)
        # Raises requests.HTTPError for 4xx/5xx responses.
        r.raise_for_status()
        # Decode with the encoding guessed from the body rather than the one
        # declared in the response headers.
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        # Only network/HTTP failures are treated as "page unavailable";
        # programming errors and KeyboardInterrupt still propagate.
        print("异常")
        return " "


def university_list(ulist, html):
    """Extract table rows from *html* and append them to *ulist*.

    Every ``<tr>`` directly under the first ``<tbody>`` contributes one list
    holding the whitespace-stripped text of its first five ``<td>`` cells.

    Args:
        ulist: List that receives the rows; it is mutated in place.
        html: Page markup. Blank or empty text (for example the ``" "``
            placeholder produced by a failed download) yields no rows.

    Returns:
        None. Results are delivered through *ulist*.
    """
    # Nothing to parse: avoids building a parser for a failed download.
    if not html or not html.strip():
        return

    soup = BeautifulSoup(html, "html.parser")
    tbody = soup.find('tbody')
    if tbody is None:
        # find() returns None when the page has no <tbody>; treat as no data.
        return

    for tr in tbody.children:
        # .children also yields whitespace text nodes between rows; skip them.
        if not isinstance(tr, bs4.element.Tag):
            continue
        tds = tr('td')  # calling a Tag is shorthand for find_all()
        if len(tds) < 5:
            # Malformed or header-like row without the five expected columns.
            continue
        # strip() removes the padding whitespace/newlines around each value.
        ulist.append([td.text.strip() for td in tds[:5]])


def university_rank(ulist, num):
    """Print a header line followed by up to *num* rows of *ulist*.

    Args:
        ulist: Sequence of rows; each row supplies five values (rank, school
            name, province/city, type, total score) at indexes 0-4.
        num: Maximum number of rows to print. If *ulist* holds fewer rows,
            only the available ones are printed; a non-positive value prints
            the header alone.
    """
    # Field {5} is the fill character for the centred columns. chr(12288) is
    # the full-width (CJK) space, so Chinese text lines up across rows.
    fill = chr(12288)
    template = "{0:^10}\t{1:{5}^10}\t{2:{5}^10}\t{3:{5}^10}\t{4:^10}"
    print(template.format("排名", "学校名称", "省市", "类型", "总分", fill))

    # Slice instead of indexing range(num) so a short list cannot raise
    # IndexError; max() keeps a negative num from slicing off the tail.
    for u in ulist[:max(num, 0)]:
        print(template.format(u[0], u[1], u[2], u[3], u[4], fill))


def main():
    """Fetch the ranking page at the hard-coded URL and print its top rows.

    At most 76 rows are printed. Nothing is parsed or printed when the
    download yields only blank text.
    """
    university_info = []
    url = "http://www.shanghairanking.cn/rankings/bcur/2020"
    html = get_text(url)
    if not html.strip():
        # get_text reports failure with a blank string; there is no table
        # to parse, so stop instead of crashing in the parser.
        return
    university_list(university_info, html)
    # Never request more rows than were actually scraped.
    university_rank(university_info, min(76, len(university_info)))


# Run the scraper only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()

转载自:https://blog.csdn.net/cjx_up/article/details/77883892

posted @ 2020-11-09 18:02  太阳花2020  阅读(132)  评论(0)  编辑  收藏  举报