爬虫纪要
1. urllib 与 urllib2 的用法(以 POST 方式提交表单)。
# coding=utf-8
# Python 2 example: submit a form with an HTTP POST using urllib/urllib2
# and print the response body.
import urllib
import urllib2

# Form fields to submit as the POST body.
values = {'username': 'zang932347627', 'password': 'xxoo'}
url = 'http://www.mwr.gov.cn/slzx/slyw/'  # page to request
# Browser-like User-Agent; "MSIE" is the Internet Explorer product token.
user_agent = 'Mozilla/4.0(compatible;MSIE 5.5;windows NT)'
# Referer is sent to cope with hotlink protection.
headers = {'User-Agent': user_agent, 'Referer': url}
data = urllib.urlencode(values)  # URL-encode the form fields
# Supplying `data` makes urllib2 issue a POST instead of a GET.
request = urllib2.Request(url, data, headers)
response = urllib2.urlopen(request)
try:
    print(response.read())
finally:
    # Release the underlying connection even if reading fails.
    response.close()
2. 代理服务(通过 urllib2.ProxyHandler 设置代理)。
# coding=utf-8
# Python 2 example: build a urllib2 opener that optionally goes through an
# HTTP proxy, and install it as the process-wide default opener.
import urllib2

enable_proxy = True
# Maps URL scheme -> proxy URL. The proxy URL must carry the full
# "http://" scheme prefix (with the colon) to be parsed correctly.
proxy_handler = urllib2.ProxyHandler({'http': 'http://some-proxy.com:8080'})
# An empty mapping tells ProxyHandler to use no proxy at all.
null_proxy_handler = urllib2.ProxyHandler({})
if enable_proxy:
    opener = urllib2.build_opener(proxy_handler)
else:
    opener = urllib2.build_opener(null_proxy_handler)
# After this call, plain urllib2.urlopen() uses the opener chosen above.
urllib2.install_opener(opener)
3. 模拟登录,爬取需要登录才能访问的网页。
# coding=utf-8
# Python 2 example: log in with a POST request, keep the cookies that come
# back, save them to disk, then reuse them to fetch a page behind the login.
import cookielib
import urllib
import urllib2

# File the MozillaCookieJar below writes its cookies to.
filename = 'cookie.txt'
cookie = cookielib.MozillaCookieJar(filename)
# Every request made through this opener reads from / stores into `cookie`.
cookie_processor = urllib2.HTTPCookieProcessor(cookie)
opener = urllib2.build_opener(cookie_processor)

credentials = {'username': 'zang963469010', 'password': 'xxoo.'}
postdata = urllib.urlencode(credentials)

# Sign-in URL of cnblogs.
url = 'https://passport.cnblogs.com/user/signin?ReturnUrl=http%3A%2F%2Fwww.cnblogs.com%2F'
# Simulated login; cookies set by the response end up in `cookie`.
result = opener.open(url, postdata)
# Write the cookies to cookie.txt, keeping ones marked discard or expired.
cookie.save(ignore_discard=True, ignore_expires=True)

# Fetch a second page with the same opener so the login cookies are sent
# (a page that is only reachable after logging in).
url2 = 'http://www.cnblogs.com/zang963469010/'
result = opener.open(url2)
print(result.read())