[定向爬虫] 网络爬虫实例1
程序结构设计:
1.获取网页内容
getHtmlText()
2.提取网页内容中的信息并存储到合适的数据结构中
fillUnivList()
3.利用数据结构展示并输出结果
printUnivList()
实现代码
import requests
from bs4 import BeautifulSoup
import bs4


def getHtmlText(url):
    """Download the page at *url* and return its text decoded as UTF-8.

    This is a best-effort fetch: any failure reported by ``requests``
    (connection error, timeout, invalid URL, non-2xx status) results in an
    empty string instead of an exception.

    Args:
        url: Address of the page to download.

    Returns:
        The response body as text, or ``''`` if the request failed.
    """
    try:
        # Without a timeout requests may block forever on a stalled server.
        r = requests.get(url, timeout=30)
        r.raise_for_status()
        r.encoding = 'utf-8'
        return r.text
    except requests.RequestException:
        # Only request-level failures are swallowed; KeyboardInterrupt and
        # programming errors still propagate.
        return ''


def fillUnivList(ulist, html):
    """Extract the ranking rows from *html* and append them to *ulist*.

    Every tag that is a direct child of the first ``<tbody>`` contributes one
    entry ``[rank, name, score]`` built from the ``.string`` of its 1st, 2nd
    and 4th ``<td>`` cells. Rows with fewer than four cells are skipped, and
    nothing is appended when the document has no ``<tbody>`` (for example
    when the download failed and *html* is empty).

    Args:
        ulist: List that receives the extracted rows (mutated in place).
        html: HTML source of the ranking page.
    """
    soup = BeautifulSoup(html, 'html.parser')
    tbody = soup.find('tbody')
    if tbody is None:
        return

    for tr in tbody.children:
        # .children also yields non-tag nodes (text between rows); skip them.
        if not isinstance(tr, bs4.element.Tag):
            continue
        tds = tr('td')  # calling a tag is shorthand for tr.find_all('td')
        if len(tds) < 4:
            continue
        ulist.append([tds[0].string, tds[1].string, tds[3].string])


def printUnivList(ulist, num):
    """Print a header line and up to *num* rows of *ulist* as a table.

    Each column is centred in a 10-character field. The middle column is
    padded with chr(12288), the full-width (CJK) space, so that Chinese
    school names line up; the outer columns use the default padding.

    Args:
        ulist: Sequence of rows, each holding at least three values
            (rank, name, score). ``None`` values are printed as blanks.
        num: Maximum number of rows to print. If *ulist* is shorter, only
            the available rows are printed.
    """
    tplt = "{0:^10}\t{1:{3}^10}\t{2:^10}"
    fill = chr(12288)
    print(tplt.format("排名", "学校名称", "总分", fill))

    # max(num, 0) keeps a negative num meaning "no rows" rather than
    # turning into a slice from the end of the list.
    for row in ulist[:max(num, 0)]:
        # None has no support for the '^10' format spec, so render it as ''.
        cells = ['' if value is None else str(value)
                 for value in (row[0], row[1], row[2])]
        print(tplt.format(cells[0], cells[1], cells[2], fill))


def main():
    """Fetch the ranking page, parse it and print the first 20 entries."""
    url = 'http://www.zuihaodaxue.cn/zuihaodaxuepaiming2016.html'
    ulist = []
    html = getHtmlText(url)
    fillUnivList(ulist, html)
    printUnivList(ulist, 20)


if __name__ == '__main__':
    main()