Python爬虫:爬取带隐藏域__EVENTVALIDATION和__VIEWSTATE的form提交网页
原理:
变量 __EVENTVALIDATION 和 __VIEWSTATE 表示当前页面的状态,其值一般存储在当前页面上。
变量 __EVENTTARGET 会被作为第一个参数传入 js 方法 __doPostBack(eventTarget, eventArgument),表示是哪一个控件被触发;比如第二个参数 eventArgument 为空,说明该控件被点击。
我们在请求页面时将上面3个变量作为请求参数加入post请求,服务器收到请求后会解析变量,响应请求,返回页面。
# -*- coding: utf-8 -*-
import urllib
import urllib2
# --- Request configuration -------------------------------------------------
# Browser-like User-Agent string sent with every request.
user_agent = (
    r'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
    r'(KHTML, like Gecko) Chrome/53.0.2785.104 Safari/537.36 '
    r'Core/1.53.2669.400 QQBrowser/9.6.10990.400'
)
headers = {'User-Agent': user_agent}

# Search page to crawl; the query string selects CMID=9 and a %u-escaped Type.
url = r"http://www.jnta.gov.cn/InfoSeach.aspx?CMID=9&Type=%u666f%u70b9%u4fe1%u606f&KeyWord="

# Control name posted as __EVENTTARGET to trigger the pager's postback.
next_button = r'ctl00$ContentPlaceHolder1$ShowListSeach2$DDPager$ctl02$ctl00'

# --- First page --------------------------------------------------------------
# The form dict is still empty at this point, so the encoded body is ''.
# NOTE(review): data is '' here, not None; urllib2 switches to POST whenever
# data is supplied, so this initial fetch is presumably sent as an empty-body
# POST rather than a GET -- confirm that is intended.
values = {}
data = urllib.urlencode(values)
req = urllib2.Request(url, headers=headers)
response = urllib2.urlopen(req, data)

# Raw Set-Cookie header of the first response (None when the server sent none).
cookie = response.headers.get('Set-Cookie')

# HTML of the first page; printed as-is.
page_html = response.read()
print(page_html)

# --- Constant postback fields ------------------------------------------------
# These stay the same for every later POST. They are inserted in this exact
# order; the per-page hidden fields are filled in elsewhere.
postback_fields = (
    ('__EVENTTARGET', next_button),
    ('__EVENTARGUMENT', ''),
    ('__LASTFOCUS', ''),
    # Hard-coded value; presumably copied from the live page -- verify if the
    # site changes.
    ('__VIEWSTATEGENERATOR', 'EBDD162D'),
    ('ctl00$tbKeyWord', ''),
    ('ctl00$ContentPlaceHolder1$Seach1$tbSeachKeyWord', ''),
    # Same values as the CMID and Type query parameters in url.
    ('ctl00$ContentPlaceHolder1$Seach1$ddlModel', '9'),
    ('ctl00$ContentPlaceHolder1$Seach1$ddlType', '景点信息'),
)
values.update(postback_fields)
from lxml import html
# One request object is reused for every postback to the same URL.
req = urllib2.Request(url, headers=headers)
# Only echo a cookie back when the first response actually set one; passing
# None to add_header would send a bogus header value.
if cookie:
    req.add_header('cookie', cookie)

# Fetch the following pages (4 postbacks). Each pass reads the hidden state
# fields from the page just received and posts them back together with the
# constant form fields, simulating a click on the pager control.
for _ in range(1, 5):
    page_index_tree = html.fromstring(page_html.decode('utf-8'))
    view_state = page_index_tree.cssselect('#__VIEWSTATE')
    event_validation = page_index_tree.cssselect('#__EVENTVALIDATION')
    if not view_state or not event_validation:
        # The page carries no hidden state (e.g. an error page), so a valid
        # postback cannot be built; stop instead of raising IndexError.
        break

    values['__VIEWSTATE'] = view_state[0].get('value')
    values['__EVENTVALIDATION'] = event_validation[0].get('value')
    data = urllib.urlencode(values)

    response = urllib2.urlopen(req, data)
    try:
        page_html = response.read()  # HTML of the next page
    finally:
        # Release the connection before the next request.
        response.close()
    print(page_html)