spider数据抓取(第二章)
download最完善的脚本
import urllib2 import urlparse def download(url, user_agent="wswp", proxy=None, num_retries=2): print "DownLoading", url headers = {"User-agent": user_agent} request = urllib2.Request(url, headers=headers) opener = urllib2.build_opener() if proxy: proxy_params = {urlparse.urlparse(url).scheme: proxy} opener.add_handler(urllib2.ProxyHandler(proxy_params)) try: html = opener.open(request).read() print 1 except urllib2.URLError as e: print "download error", e.reason html = None if num_retries > 0: if hasattr(e, "code") and 500 <= e.code <600: # retry 5xx http error html = download(url, user_agent, proxy, num_retries-1) return html
三种网页抓取的方法
1.用正则抓取数据
url = "http://example.webscraping.com/view/United-Kingdom-239" html = download(url) # print html print re.findall('<td class="w2p_fw">(.*?)</td>', html)
2.bs4抓取
bs4抓取(实验)
from bs4 import BeautifulSoup broken_html = "<ul class=country><li>Area</li><li>Population</ul>" # parse html soup = BeautifulSoup(broken_html, "html.parser") # 整个html,自动补全确实的标签 # fixed_html = soup.prettify() # print fixed_html ul = soup.find("ul", attrs={'class': "country"}) # 匹配country print ul.find("li") # 只取一个 print ul.find_all("li") # 取全部
bs4正式抓取
评价:这种方法比正则的代码量大,但是可以通过beautifulsoup补全标签的缺失
from bs4 import BeautifulSoup url = "http://example.webscraping.com/view/United-Kingdom-239" html = download(url) soup = BeautifulSoup(html) # locate the area row print soup tr = soup.find(attrs={"id": "places_area__row"}) print tr td = tr.find(attrs={"class": "w2p_fw"}) # locate the area tag area = td.text # extract the text from this tag print area
3.lxml抓取
pip install lxml
可以处理不合法的HTML,同bs4一样可以补全缺失的标签
该模块用C语言写的,解析速度比bs4快
import lxml.html broken_html = "<ul class=country><li>Area</li><li>Population</ul>" tree = lxml.html.fromstring(broken_html) # parse the HTML fixed_html = lxml.html.tostring(tree, pretty_print=True) print fixed_html
lxml的XPath选择器类似于bs4的find()
下述采用CSS选择器,更加简洁方便(语法同jQuery的选择器,由cssselect实现)
意义同上bs4的正式抓取
import lxml.html broken_html = "<ul class=country><li>Area</li><li>Population</ul>" # 网页 tree = lxml.html.fromstring(broken_html) td = tree.cssselect("tr#places_area__row>td.w2p_fw")[0] area = td.text_content() print area