Python爬虫:爬取带隐藏域__EVENTVALIDATION和__VIEWSTATE的form提交网页

原理:

变量__EVENTVALIDATION和__VIEWSTATE表示当前页面的状态,其值一般存储在当前页面的隐藏域(hidden input)中。
变量__EVENTTARGET会被作为第一个参数传入js方法__doPostBack(eventTarget, eventArgument),表示是哪一个控件被触发;第二个参数eventArgument为空时,说明该控件只是被点击。
我们在请求页面时将上面3个变量作为请求参数加入post请求,服务器收到请求后会解析这些变量,响应请求,返回页面。

# -*- coding: utf-8 -*-

import urllib
import urllib2
user_agent = r'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.104 Safari/537.36 Core/1.53.2669.400 QQBrowser/9.6.10990.400'
headers = {r'User-Agent': user_agent}
url = r"http://www.jnta.gov.cn/InfoSeach.aspx?CMID=9&Type=%u666f%u70b9%u4fe1%u606f&KeyWord="
next_button = r'ctl00$ContentPlaceHolder1$ShowListSeach2$DDPager$ctl02$ctl00'
values = {}
data = urllib.urlencode(values)
req = urllib2.Request(url, headers = headers)
response = urllib2.urlopen(req, data)
cookie = response.headers.get('Set-Cookie')
page_html = response.read() #获取首页html
print page_html

values['__EVENTTARGET'] = next_button
values['__EVENTARGUMENT'] = ''
values['__LASTFOCUS'] = ''
values['__VIEWSTATEGENERATOR'] = 'EBDD162D'
values['ctl00$tbKeyWord'] = ''
values['ctl00$ContentPlaceHolder1$Seach1$tbSeachKeyWord'] = ''
values['ctl00$ContentPlaceHolder1$Seach1$ddlModel'] = '9'
values['ctl00$ContentPlaceHolder1$Seach1$ddlType'] = '景点信息'
from lxml import html

req = urllib2.Request(url, headers = headers)
req.add_header('cookie', cookie)
for i in range(1, 5):
    page_index_tree = html.fromstring(page_html.decode('utf-8'))
    __VIEWSTATE = page_index_tree.cssselect('#__VIEWSTATE')
    __EVENTVALIDATION = page_index_tree.cssselect('#__EVENTVALIDATION')
    values['__VIEWSTATE'] = __VIEWSTATE[0].get('value')
    values['__EVENTVALIDATION'] = __EVENTVALIDATION[0].get('value')

    data = urllib.urlencode(values)
    response = urllib2.urlopen(req, data)
    page_html = response.read() #获取下一页
    print page_html
posted @ 2018-12-28 09:04  xuejianbest  阅读(2486)  评论(0)  编辑  收藏  举报