通过 requests 获取网页资讯,并通过 BeautifulSoup 剖析网页元素

import requests

# Sina domestic-news landing page used as the sample page to download.
newsUrl = 'http://news.sina.com.cn/china/'

# Fetch the page. The timeout keeps the script from blocking forever if the
# server never answers.
res = requests.get(newsUrl, timeout=10)

# Decode the response body as UTF-8 when res.text is read.
res.encoding = 'utf-8'

# Dump the decoded HTML of the page.
print(res.text)

# 然后通过 DOM Tree 来剖析网页元素

from bs4 import BeautifulSoup

# Minimal HTML document used to demonstrate BeautifulSoup queries.
# Adjacent literals are concatenated by the parser into one single-line string,
# which avoids fragile backslash line continuations inside a quoted literal.
html_sample = (
    '<html>'
    '<body>'
    '<h1 id="title">this is h1</h1>'
    '<a class="link" href="fdfdfdfd">this is a link</a>'
    '<a class="link" href="fdfdfdfd">this is another link</a>'
    '</body>'
    '</html>'
)

# Name the parser explicitly ('html.parser'); leaving it out makes
# BeautifulSoup emit a warning.
soup = BeautifulSoup(html_sample, 'html.parser')

# Print the text content of the parsed document.
print(soup.text)

# Find every HTML element that has a given tag.

# 1: use select() to find the elements with an h1 tag.
header = soup.select('h1')

print(header)
# Text inside the first (index 0) matched tag.
print(header[0].text)

# 2: use select() to find the elements with an a tag.
alink = soup.select('a')

print(alink)

for link in alink:
    # print(link) would show the whole tag rather than only its text.
    print(link.text)

# Get the elements that carry a specific CSS attribute.

# 1: use select() to find every element whose id is "title"
#    (an id selector is prefixed with '#').
aTitle = soup.select('#title')

print(aTitle)

# 2: use select() to find every element whose class is "link"
#    (a class selector is prefixed with '.').
for mylink in soup.select('.link'):
    print(mylink)

# Get the link target of every a tag.

# Use select() to find all a tags, then read each tag's href attribute.
ahref = soup.select('a')

for ah in ahref:
    # Subscripting a tag looks up the attribute with that name.
    print(ah['href'])

posted on 2017-08-21 14:55 taiyang2014 阅读(448) 评论(0) 收藏举报

刷新页面返回顶部

taiyang2014