Python 中国大学排名定向爬虫

代码来自于中国大学MOOC北京理工大学Python教学团队：https://www.icourse163.org/learn/BIT-1001870001#/learn/content?type=detail&id=1211970249&cid=1215042961

1.函数版

#中国大学定向爬虫
import requests
from bs4 import BeautifulSoup
import bs4
     
def getHTMLText(url):
    """Download *url* and return the response body as text.

    The request uses a 30 second timeout. The response encoding is replaced
    by ``apparent_encoding`` before decoding, so the text is decoded with the
    encoding guessed from the body rather than the one declared in headers.

    Args:
        url: Address of the page to fetch.

    Returns:
        The decoded page text, or "" if the request fails (connection
        error, timeout, invalid URL, or a 4xx/5xx status).
    """
    try:
        r = requests.get(url, timeout=30)
        # Turn HTTP error statuses into exceptions so they hit the handler.
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        # Best-effort fetch: callers treat "" as "nothing downloaded".
        # Only requests' own errors are swallowed; KeyboardInterrupt and
        # programming errors still propagate.
        return ""
     
def fillUnivList(ulist, html):
    """Extract table rows from *html* and append them to *ulist* in place.

    For each tag that is a direct child of the first ``<tbody>``, the
    ``.string`` of its 1st, 2nd and 4th ``<td>`` cells is appended as a
    three-item list. ``.string`` may be None for a cell that contains
    nested markup.

    Args:
        ulist: List that receives one ``[cell0, cell1, cell3]`` entry per row.
        html: Page source to parse; may be empty.

    Returns:
        None. *ulist* is left untouched when the page has no ``<tbody>``.
    """
    soup = BeautifulSoup(html, "html.parser")
    tbody = soup.find('tbody')
    if tbody is None:
        # No table at all (e.g. empty html after a failed download).
        return
    for tr in tbody.children:
        # .children also yields whitespace text nodes between rows.
        if not isinstance(tr, bs4.element.Tag):
            continue
        tds = tr('td')  # calling a tag is shorthand for find_all()
        if len(tds) < 4:
            # Not a full data row; indexing tds[3] would raise IndexError.
            continue
        ulist.append([tds[0].string, tds[1].string, tds[3].string])
     
def printUnivList(ulist, num):
    """Print a header line followed by up to *num* rows of *ulist*.

    Each column is centred in a 10-character field. The middle column is
    padded with chr(12288) (the full-width space) instead of the ASCII
    space so that Chinese text lines up.

    Args:
        ulist: Sequence of rows; the first three items of each row are printed.
        num: Maximum number of rows to print. If *ulist* is shorter, only
            the available rows are printed; a value <= 0 prints only the
            header.
    """
    tplt = "{0:^10}\t{1:{3}^10}\t{2:^10}"
    pad = chr(12288)
    print(tplt.format("排名","学校名称","总分",pad))
    # Slicing bounds the loop to the rows that exist; max() keeps a negative
    # num from turning into a "drop the last rows" slice.
    for u in ulist[:max(num, 0)]:
        # None does not support the "^10" format spec, so show it as empty.
        rank, name, score = ("" if v is None else v for v in (u[0], u[1], u[2]))
        print(tplt.format(rank, name, score, pad))
         
def main():
    """Download the ranking page, parse it and print the first 20 rows.

    Prints "爬取失败" and returns early when the download yields no text,
    instead of passing an empty page on to the parser.
    """
    uinfo = []
    # Alternative address kept from the original listing:
    #url = 'https://www.zuihaodaxue.cn/zuihaodaxuepaiming2018.html'
    url = 'http://www.zuihaodaxue.com/zuihaodaxuepaiming2018.html'
    html = getHTMLText(url)
    if not html:
        # getHTMLText returns "" on failure; there is nothing to parse.
        print("爬取失败")
        return
    fillUnivList(uinfo, html)
    # Never ask for more rows than were actually parsed.
    printUnivList(uinfo, min(20, len(uinfo)))


if __name__ == "__main__":
    main()

 

2.修改无函数版用于学习

#中国大学定向爬虫
import requests
from bs4 import BeautifulSoup
import bs4

# Flat (function-free) version of the crawler: fetch the page, collect the
# 1st, 2nd and 4th cell of every table row, then print the first 20 rows.
ulist = []
url = 'http://www.zuihaodaxue.com/zuihaodaxuepaiming2018.html'
try:
    r = requests.get(url, timeout=30)
    r.raise_for_status()
    r.encoding = r.apparent_encoding
except requests.RequestException:
    print("爬取失败")
    # Stop here: `r` may be unbound or hold an error response, so carrying
    # on to parse it would either crash or scrape an error page.
    raise SystemExit(1)
html = r.text
soup = BeautifulSoup(html, "html.parser")
tbody = soup.find('tbody')
# find() returns None when the page has no table; treat that as zero rows.
rows = tbody.children if tbody is not None else []
for tr in rows:
    # .children also yields whitespace text nodes between rows.
    if isinstance(tr, bs4.element.Tag):
        tds = tr('td')
        if len(tds) >= 4:  # skip rows too short to have a 4th cell
            ulist.append([tds[0].string, tds[1].string, tds[3].string])

tplt = "{0:^10}\t{1:{3}^10}\t{2:^10}"
pad = chr(12288)  # full-width space, so Chinese text stays aligned
print(tplt.format("排名","学校名称","总分",pad))
num = 20
for u in ulist[:num]:  # print at most the first 20 rows that were parsed
    # None (a cell with nested markup) cannot take the "^10" format spec.
    rank, name, score = ("" if v is None else v for v in (u[0], u[1], u[2]))
    print(tplt.format(rank, name, score, pad))
print("爬取完毕")

 

posted @ 2019-10-26 17:07  xdd1997  阅读(305)  评论(0)  编辑  收藏  举报