A Simple Crawler

1. A Simple Crawler

Goal: write a crawler that fetches the home pages of common search engines (such as Baidu, Sogou, and so on).

Approach: use Python's requests, BeautifulSoup4, and lxml libraries.
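
All three are third-party packages; if they are not already available they can be installed with pip (pip install requests beautifulsoup4 lxml).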

Implementation:

First, we need to fetch the page. This is straightforward with requests.get(), as shown below:

 

# coding=utf-8

import requests

url = "http://www.baidu.com"

# set a time limit on the request; if the server takes too long an
# exception is raised instead of waiting forever on a slow network
r = requests.get(url, timeout=1)

# a status code of 200 means the page was fetched successfully
print(r.status_code)

Running this from cmd on Windows 10 prints a single 200, which means the connection succeeded.
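
Note that with timeout=1 a slow server makes requests raise an exception rather than return. A minimal sketch of catching that case, using the exception classes that requests provides:

# coding=utf-8
import requests

url = "http://www.baidu.com"

try:
    r = requests.get(url, timeout=1)
    print(r.status_code)
except requests.exceptions.Timeout:
    # the server did not respond within 1 second
    print("request timed out")
except requests.exceptions.RequestException as e:
    # any other network problem (DNS failure, connection refused, ...)
    print("request failed:", e)

The next script extends this idea to several search engines and starts pulling links out of the pages it downloads.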

 

# coding=utf-8
import requests
from lxml import html
from bs4 import BeautifulSoup

# Lesson learned: never give the script the same name as a library,
# otherwise Python finds your file first and the import breaks
url = [
    "http://www.baidu.com",
    "http://www.google.com",
    "http://www.sogou.com",
    "http://www.bing.com",
    "http://www.so.com"
]

name = ["baidu", "google", "sogou", "bing", "360"]

'''
for i in range(5):
    r = requests.get(url[i], timeout=10)
    r.encoding = 'utf-8'
    tree = html.fromstring(r.text)
    urls = []
    if r.status_code == 200:
        # create an html file and save the downloaded page into it
        with open("D:\\{}.html".format(name[i]), 'w', encoding='utf-8') as f:
            f.write(r.text)
        print("This is {} times: Successful!".format(i))
        # collect every href attribute found in the page
        for href in tree.xpath("//@href"):
            urls.append(href)
        for u in urls:
            print(u)
    else:
        print("This is {} times: False!".format(i))
'''

# fetch the first URL 20 times and print the status code each time
for i in range(20):
    r = requests.get(url[0], timeout=1)
    print(r.status_code)
r.encoding = 'utf-8'

print("type of text: ", type(r.text))
print("type of content: ", type(r.content))

# BeautifulSoup needs the page text (and a parser), not the Response object
soup = BeautifulSoup(r.text, "html.parser")
print(soup.get_text())
#print(r.text)
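
As the two print statements show, r.text is a str while r.content is the raw bytes of the response; decoding those bytes with the charset set on r.encoding gives essentially the same string. A small sketch to illustrate the relationship (assuming the page really is UTF-8 encoded):

import requests

r = requests.get("http://www.baidu.com", timeout=10)
r.encoding = 'utf-8'

# r.content holds the undecoded bytes, r.text the decoded string
print(type(r.content))   # <class 'bytes'>
print(type(r.text))      # <class 'str'>
print(r.content.decode('utf-8')[:50])   # same text as r.text[:50]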

Working with an HTML document. The tasks are:

a. print the contents of the body tag

b. get the contents of the body tag

c. get the tag object whose id is first

d. extract and print the Chinese characters in the HTML page

The snippet below parses a small sample document; a sketch covering all four tasks follows it.

 

from bs4 import BeautifulSoup

html_doc = """
<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8">
<title>菜鸟教程(runoob.com)</title>
</head>
<body>
<h1>我的第一个标题</h1>
<p>我的第一个段落。</p>
</body>
<table border="1">
<tr>
<td>row 1, cell 1</td>
<td>row 1, cell 2</td>
</tr>
<tr>
<td>row 2, cell 1</td>
<td>row 2, cell 2</td>
</tr>
</table>
</html>
"""

# parse the page with BeautifulSoup and get a BeautifulSoup object
soup = BeautifulSoup(html_doc, "html.parser")
# print the page source, pretty-printed
print(soup.prettify())
# print the text content of the page
#print(soup.get_text())
text = soup.get_text()
print("---------------")
#print()
# print the number of top-level nodes in the parsed document
print(len(soup.contents))
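
The snippet above only parses and pretty-prints the document. A minimal sketch of tasks a through d is given below, reusing the html_doc string defined above. Note that this sample document contains no tag with id "first", so that lookup returns None here and is shown only to illustrate the call; the Chinese characters are pulled out with a common CJK Unicode range, which is one reasonable reading of task d.

import re
from bs4 import BeautifulSoup

soup = BeautifulSoup(html_doc, "html.parser")

# a. print the contents of the body tag
print(soup.body)

# b. get the contents of the body tag (a list of its child nodes)
body_children = soup.body.contents
print(body_children)

# c. get the tag object whose id is "first"
#    (html_doc has no such tag, so this prints None)
first_tag = soup.find(id="first")
print(first_tag)

# d. extract and print the Chinese characters in the page
chinese = re.findall(r'[\u4e00-\u9fa5]+', soup.get_text())
print(''.join(chinese))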

 

A first real exercise

Crawl the Chinese university ranking, http://www.zuihaodaxue.com/zuihaodaxuepaiming2018.html.

Here we fetch the 2015 edition of the ranking.

# coding=utf-8
import requests
from bs4 import BeautifulSoup
import pandas as pd
from pymongo import MongoClient

client = MongoClient()
db = client['UnivRanking']
collection = db['UnivRanking']

def save_to_mongo(result):
    # insert_many expects a list of dicts, so wrap each scraped row in a document
    try:
        if collection.insert_many([{'row': row} for row in result]):
            print('Save to Mongo')
    except Exception:
        print('Save to Mongo failed')

allUniv = []

def getHTMLText(url):
    try:
        r = requests.get(url, timeout = 30)
        r.raise_for_status()
        r.encoding = 'utf-8'
        return r.text
    except:
        return ""

def fillUnivList(soup):
    data = soup.find_all('tr')
    for tr in data:
        ltd = tr.find_all('td')
        if len(ltd) == 0:
            continue

        singleUniv = []

        for td in ltd:
            singleUniv.append(td.string)

        allUniv.append(singleUniv)

def printUnivList(num):

    # chr(12288) is the full-width (ideographic) space; using it as the fill
    # character keeps columns of Chinese text aligned
    print("{1:^4}{2:{0}^10}{3:{0}^5}{4:{0}^8}{5:{0}^8}{6:{0}^8}{7:{0}^8}".format(chr(12288), "排名", "学校名称", "省市", "总分", "人才培养得分", "科学研究得分", "社会服务得分"))

    for i in range(num):
        u = allUniv[i]
        print("{1:{0}^4}{2:{0}^10}{3:{0}^5}{4:{0}^8}{5:{0}^10}{6:{0}^10}{7:{0}^10}".format(chr(12288), u[0], u[1], u[2], u[3], u[4], u[5], u[6]))

    # also dump the full table to a CSV file via pandas
    name = ["排名", "学校名称", "省市", "总分", "人才培养得分", "科学研究得分", "社会服务得分"]
    test = pd.DataFrame(columns=name, data=allUniv)
    print(test)
    test.to_csv('testcsv.csv', encoding='utf-8')
    

def main(num):

    url = "http://www.zuihaodaxue.cn/zuihaodaxuepaiming2015_0.html"

    html = getHTMLText(url)
    soup = BeautifulSoup(html, "html.parser")
    fillUnivList(soup)
    printUnivList(num)
    save_to_mongo(allUniv)

if __name__ == "__main__":
    
    main(100) 
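
When run as a script, main(100) prints the first 100 rows, writes the full table to testcsv.csv, and then tries to store the rows in MongoDB. The bare MongoClient() call assumes a MongoDB server is listening on the default localhost:27017; if none is running, save_to_mongo simply reports a failure and the rest of the output is unaffected.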

 
