网络爬虫基础练习
1.利用requests.get(url)获取网页页面的html文件
# Download the campus-news index page, report the HTTP status code, and list
# every element whose id attribute is "book".
import bs4
import requests

url = 'http://news.gzcc.cn/html/xiaoyuanxinwen/'
# A timeout keeps the script from blocking indefinitely if the server stalls.
response = requests.get(url, timeout=10)
status_code = response.status_code
# The raw bytes are decoded explicitly as UTF-8 before being parsed with lxml.
content = bs4.BeautifulSoup(response.content.decode("utf-8"), "lxml")
element = content.find_all(id='book')
print(status_code)
print(element)
2.利用BeautifulSoup的HTML解析器,生成结构树
# Parse a local HTML file with the html5lib parser and print the text of the
# first element matched by the CSS selector "#author".
import bs4

# The context manager closes the file even if reading or parsing raises.
with open('example.html') as exampleFile:
    exampleSoup = bs4.BeautifulSoup(exampleFile.read(), 'html5lib')

elems = exampleSoup.select('#author')
type(elems)  # only echoes a value in an interactive session; no effect in a script
print(elems[0].getText())
3.找出特定标签的html元素
# Fetch the campus-news index page and print its text, then build a small
# BeautifulSoup tree from an inline HTML sample and print that tree's text.
import requests
from bs4 import BeautifulSoup

newsurl = 'http://news.gzcc.cn/html/xiaoyuanxinwen/'
# A timeout keeps the script from blocking indefinitely if the server stalls.
res = requests.get(newsurl, timeout=10)
# Force UTF-8 so that res.text is decoded with that encoding.
res.encoding = 'utf-8'
print(res.text)

# NOTE(review): the original literal was empty (and written with curly quotes,
# which is a syntax error). The markup below is reconstructed from the expected
# outputs shown in the comments of the later snippets (one <h1 id="title"> and
# two <a class="link"> tags) -- confirm against the original tutorial.
html_sample = (
    '<html><body>'
    '<h1 id="title">Hello World</h1>'
    '<a class="link" href="#">This is link1</a>'
    '<a class="link" href="#link2">This is link2</a>'
    '</body></html>'
)
soup = BeautifulSoup(html_sample, 'html.parser')
print(soup.text)
4.取得含有特定CSS属性的元素
# Select by id: "#title" matches elements whose id attribute is "title".
alink = soup.select('#title')
print(alink)  # [<h1 id="title">Hello World</h1>]

# Re-parse the sample, naming the parser explicitly so the result does not
# depend on which optional parsers happen to be installed.
soup = BeautifulSoup(html_sample, 'html.parser')
# Select by class: ".link" matches elements whose class list contains "link".
for link in soup.select('.link'):
    print(link)
5.练习:
取出h1标签的文本
# Parse the sample (explicit parser for reproducible results) and show the
# <h1> match list, its first element, and that element's text.
soup = BeautifulSoup(html_sample, 'html.parser')
header = soup.select('h1')
print(header)          # [<h1 id="title">Hello World</h1>]
print(header[0])       # <h1 id="title">Hello World</h1>
print(header[0].text)  # Hello World
取出a标签的链接
# Collect every <a> tag, show the match list, then print each tag's link target.
alink = soup.select('a')
print(alink)  # [<a class="link" href="#">This is link1</a>, <a class="link" href="#link2">This is link2</a>]
for link in alink:
    # The exercise asks for the link itself, so print the href attribute
    # rather than the whole tag; .get() yields None for an <a> without href.
    print(link.get('href'))
取出所有li标签的所有内容
# Inspect the first <li> of the parsed document: the tag itself, its .string
# value, and the type of that value.
first_li = soup.li
print(first_li)
li_string = first_li.string
print(li_string)
print(type(li_string))
# Example output recorded by the author, for a page whose first <li> holds
# only an HTML comment (the comment text "内容" means "content"):
#   <li><!--内容--></li>
#   <class 'bs4.element.Comment'>
取出一条新闻的标题、链接、发布时间、来源
# Print four fields of the first news entry found in the parsed page.

# Title: text of the first element with class "news-list-title" under a <div>.
title_node = soup.select('div .news-list-title')[0]
print(title_node.text)

# Link: href attribute of the parent of the first "news-list-thumb" element.
thumb_node = soup.select('div .news-list-thumb')[0]
print(thumb_node.parent.attrs.get('href'))

# First two <span> children of "news-list-info"; presumably the publication
# time followed by the source -- TODO confirm against the page markup.
info_spans = soup.select('div .news-list-info > span')
print(info_spans[0].text)
print(info_spans[1].text)