Web Spider 01
基于 Python 3
基本应用
import urllib.request
# Fetch a single page and print its decoded HTML.
url = 'http://www.douban.com'
request = urllib.request.Request(url)
# Use the response as a context manager so the underlying connection is
# always closed, even if reading the body raises.
with urllib.request.urlopen(request) as response:
    html = response.read()
    # Prefer the charset declared in the Content-Type header; fall back to
    # UTF-8 (the previously hard-coded value) when none is declared.
    charset = response.headers.get_content_charset() or 'utf-8'
html = str(html, charset)
print(html)
简单登录
import urllib.parse
import urllib.request
import http.cookiejar
# Main login page.
host_url = 'http://127.0.0.1:8008/login_page.php'
# Page that receives and processes the POST data (the constructed login form
# is sent to this URL).
post_url = 'http://127.0.0.1:8008/login.php'

# Set up a cookie processor: it stores cookies sent by the server in the
# local jar and attaches them to subsequent requests.
cj = http.cookiejar.LWPCookieJar()
cookie_support = urllib.request.HTTPCookieProcessor(cj)
opener = urllib.request.build_opener(
    cookie_support, urllib.request.HTTPHandler)
# Install globally so every urllib.request.urlopen() call below uses the jar.
urllib.request.install_opener(opener)

# Open the login page first so its cookies are in the jar before the POST is
# sent. The body is not needed, so the response is closed right away instead
# of being left open.
h = urllib.request.urlopen(host_url)
h.close()

# Request headers. Only User-Agent is set here; the value mimics a browser.
headers = {
    'User-Agent':
        'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:14.0) Gecko/20100101 Firefox/14.0.1',
}
post_data = {
    'username': '***',
    'password': '***',
}
# The POST body must be URL-encoded and then converted to bytes.
post_data = urllib.parse.urlencode(post_data).encode('utf-8')

# Build the POST request with urllib.request and send it to complete the login.
request = urllib.request.Request(post_url, post_data, headers)
print(request)
# Context manager guarantees the connection is released after reading.
with urllib.request.urlopen(request) as response:
    text = response.read()
    # Prefer the charset declared by the server; fall back to UTF-8 (the
    # previously hard-coded value) when none is declared.
    charset = response.headers.get_content_charset() or 'utf-8'
text = str(text, charset)
print(text)

save_path = "snatch2.txt"
# The file need not exist beforehand: mode 'w' creates or truncates it.
# An explicit encoding avoids UnicodeEncodeError on platforms whose default
# locale encoding cannot represent the page text, and the `with` block
# ensures the data is flushed and the file handle closed.
with open(save_path, 'w', encoding='utf-8') as f_obj:
    f_obj.write(text)
print("snatch successfully.")