Scrapy学习-9-FromRequest
用FromRequest模拟登陆知乎网站
实例
默认登陆成功以后的请求都会带上cookie
# -*- coding: utf-8 -*- import re import json import datetime try: import urlparse as parse except: from urllib import parse import scrapy class ZhihuSpider(scrapy.Spider): name = "zhihu" allowed_domains = ["www.zhihu.com"] start_urls = ['https://www.zhihu.com/'] headers = { "HOST": "www.zhihu.com", "Referer": "https://www.zhizhu.com", 'User-Agent': "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:51.0) Gecko/20100101 Firefox/51.0" } def parse(self, response): pass def start_requests(self): return [scrapy.Request('https://www.zhihu.com/#signin', headers=self.headers, callback=self.login)] def login(self, response): response_text = response.text match_obj = re.match('.*name="_xsrf" value="(.*?)"', response_text, re.DOTALL) xsrf = '' if match_obj: xsrf = (match_obj.group(1)) if xsrf: post_url = "https://www.zhihu.com/login/phone_num" post_data = { "_xsrf": xsrf, "phone_num": "18782902568", "password": "admin123" } return [scrapy.FormRequest( url = post_url, formdata = post_data, headers=self.headers, callback=self.check_login )] def check_login(self, response): #验证服务器的返回数据判断是否成功 text_json = json.loads(response.text) if "msg" in text_json and text_json["msg"] == "登录成功": for url in self.start_urls: yield scrapy.Request(url, dont_filter=True, headers=self.headers)