Learning Web Crawlers

1. Use the get() function of the requests library to access a website 20 times; print the returned status and the text content, and compute the lengths of the page content returned by the text attribute and the content attribute.

import requests

for i in range(20):
    r = requests.get("http://www.googlen.org/")
    r.raise_for_status()              # raise an exception on a 4xx/5xx response
    print(r.status_code)              # print the returned status code
    r.encoding = 'utf-8-sig'
    print(r.text)                     # print the page text
    with open('1.txt', mode='w') as file_handle:
        file_handle.write(r.text)     # save the page text (overwritten each pass)
    print("{},{}".format(len(r.text), len(r.content)))
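
For reference, r.text is the response body decoded to a string via r.encoding, while r.content is the raw bytes, so the two lengths differ whenever the page contains multi-byte characters. A minimal sketch of the difference (example.com is used here purely for illustration):

import requests

r = requests.get("http://www.example.com/")
print(r.status_code)       # HTTP status of the response
print(len(r.content))      # length in bytes (raw body)
print(len(r.text))         # length in characters (decoded body)
# For a pure-ASCII page the two lengths match; with a multi-byte
# encoding such as UTF-8 Chinese text, len(r.content) is larger.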

2. A web crawler for the Chinese university rankings

import requests 
from bs4 import BeautifulSoup
import bs4
import pandas as pd
 
def getHTMLText(url):    # fetch the ranking page from the "best universities" site
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()                  # raise on a failed request
        r.encoding = r.apparent_encoding      # guess the encoding from the content
        return r.text
    except:
        return ""
def fillUnivList(ulist, html):    # extract the needed fields and store them in a list
    soup = BeautifulSoup(html, "html.parser")
    for tr in soup.find('tbody').children:
        if isinstance(tr, bs4.element.Tag):   # skip non-tag nodes such as newlines
            tds = tr('td')
            ulist.append([tds[0].string, tds[1].string, tds[3].string])   # rank, name, total score
    data1 = pd.DataFrame(ulist)
    data1.to_csv('data1.csv')                 # also dump the table to a CSV file
def printUnivList(ulist, num):    # print the information as a formatted table
    print("{:^10}\t{:^6}\t{:^10}".format("排名", "学校名称", "总分"))
    for i in range(num):
        u = ulist[i]
        print("{:^10}\t{:^6}\t{:^10}".format(u[0], u[1], u[2]))
def main():
    uinfo = []
    url = 'http://www.zuihaodaxue.cn/zuihaodaxuepaiming2016.html'
    html = getHTMLText(url)
    fillUnivList(uinfo, html)
    printUnivList(uinfo, 300)


main()
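
As a side note, the row-parsing idiom in fillUnivList can be tested against a small hand-written table; the HTML below (including the scores) is made up for demonstration. tr('td') is shorthand for tr.find_all('td'), and .string returns each cell's text:

from bs4 import BeautifulSoup
import bs4

html = """<table><tbody>
<tr><td>1</td><td>清华大学</td><td>北京</td><td>95.9</td></tr>
<tr><td>2</td><td>北京大学</td><td>北京</td><td>82.6</td></tr>
</tbody></table>"""

soup = BeautifulSoup(html, "html.parser")
for tr in soup.find('tbody').children:
    if isinstance(tr, bs4.element.Tag):   # skip the newline text nodes between rows
        tds = tr('td')                    # same as tr.find_all('td')
        print(tds[0].string, tds[1].string, tds[3].string)   # rank, name, score

Note that pd.DataFrame(ulist).to_csv('data1.csv') writes the rows with a default integer index and no header; passing columns=['排名', '学校名称', '总分'] and index=False would give a cleaner file.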