beautifulsoup功能

#coding:utf-8
from urllib.request import urlopen
from bs4 import BeautifulSoup
from urllib.error import HTTPError,URLError

def getinfo(url):
    """Fetch *url* and pull a few elements out of the page with BeautifulSoup.

    Args:
        url: Address of the page to download and parse.

    Returns:
        A tuple ``(title, nameList, all_theprince)`` where

        * ``title`` is the value of ``bsobj.h1`` for the parsed page,
        * ``nameList`` holds every ``<span>`` tag whose class is ``green``,
        * ``all_theprince`` holds the strings matched by
          ``text="the prince"``.

        ``None`` is returned instead when the page cannot be opened
        (``HTTPError`` / ``URLError``) or when an ``AttributeError`` is
        raised while reading the parsed document.
    """
    try:
        # The context manager closes the HTTP response as soon as the body
        # has been read, even if parsing raises.
        with urlopen(url) as html:
            bsobj = BeautifulSoup(html.read(), "lxml")
        title = bsobj.h1
        nameList = bsobj.find_all("span", {"class": "green"})
        all_theprince = bsobj.find_all(text="the prince")
    except (HTTPError, URLError, AttributeError):
        # HTTP error status, unreachable/unknown server, or access to a
        # missing attribute on the parsed document.
        return None
    return title, nameList, all_theprince
# Download and parse the sample page; getinfo returns None on failure.
url_Info = getinfo("http://www.pythonscraping.com/pages/warandpeace.html")
if url_Info is None:
    # Explicit check instead of a bare ``except:`` around the indexing, so
    # unrelated errors are no longer reported as a missing URL.
    print("URL could not be found")
else:
    title, nameList, all_theprince = url_Info
    print(title)
    for name in nameList:
        # get_text() drops the tag markup and leaves only the text.
        print(name.get_text())
    print(len(all_theprince))

posted on 2018-03-13 15:54  学习代码小仓库  阅读(160)  评论(0)  编辑  收藏  举报