BeautifulSoup 功能

#coding:utf-8
from urllib.request import urlopen
from bs4 import BeautifulSoup
from urllib.error import HTTPError,URLError

def getinfo(url):
    """Download a page and pull out its heading, green spans and "the prince" hits.

    Args:
        url: Address of the page to download.

    Returns:
        A ``(title, nameList, all_theprince)`` tuple, or ``None`` when the
        page could not be opened. ``title`` is the first ``<h1>`` tag
        (``None`` if the page has none), ``nameList`` holds every
        ``<span class="green">`` tag, and ``all_theprince`` holds every text
        node equal to ``"the prince"``.
    """
    try:
        # The context manager closes the response even if read() fails,
        # so the underlying connection is never leaked.
        with urlopen(url) as response:
            page_source = response.read()
    except (HTTPError, URLError):
        # HTTP error status, or the server could not be reached at all.
        # (HTTPError is a subclass of URLError; both are listed for clarity.)
        return None

    # Parsing happens outside the try block: only the network call above can
    # raise the errors handled there.
    bsobj = BeautifulSoup(page_source, "lxml")
    title = bsobj.h1
    nameList = bsobj.find_all("span", {"class": "green"})
    all_theprince = bsobj.find_all(string="the prince")
    return title, nameList, all_theprince
# Scrape the sample page and print the heading, the green-span names and the
# number of "the prince" occurrences.
url_Info = getinfo("http://www.pythonscraping.com/pages/warandpeace.html")
if url_Info is None:
    # getinfo() returns None when the page could not be opened. Checking for
    # it explicitly avoids a bare `except:` that would also hide unrelated
    # bugs and swallow KeyboardInterrupt.
    print("URL could not be found")
else:
    title, nameList, all_theprince = url_Info
    print(title)
    for name in nameList:
        print(name.get_text())  # drop the tag markup, keep only the text
    print(len(all_theprince))
posted on 2018-03-13 15:54 学习代码小仓库阅读(167) 评论(0) 收藏举报