Request
Part of the Request source code:
```python
# partial source
class Request(object_ref):

    def __init__(self, url, callback=None, method='GET', headers=None, body=None,
                 cookies=None, meta=None, encoding='utf-8', priority=0,
                 dont_filter=False, errback=None):
        self._encoding = encoding  # this one has to be set first
        self.method = str(method).upper()
        self._set_url(url)
        self._set_body(body)
        assert isinstance(priority, int), "Request priority not an integer: %r" % priority
        self.priority = priority

        assert callback or not errback, "Cannot use errback without a callback"
        self.callback = callback
        self.errback = errback

        self.cookies = cookies or {}
        self.headers = Headers(headers or {}, encoding=encoding)
        self.dont_filter = dont_filter

        self._meta = dict(meta) if meta else None

    @property
    def meta(self):
        if self._meta is None:
            self._meta = {}
        return self._meta
```
The most commonly used parameters:
- url: the URL to request; its response is handed on for further processing.
- callback: specifies which function handles the Response returned by this request.
- method: usually does not need to be set; defaults to GET. Can be "GET", "POST", "PUT", etc., and must be an uppercase string.
- headers: the headers sent with the request. Usually not needed. Typical contents (anyone who has written a crawler will recognize these):

```
Host: media.readthedocs.org
User-Agent: Mozilla/5.0 (Windows NT 6.2; WOW64; rv:33.0) Gecko/20100101 Firefox/33.0
Accept: text/css,*/*;q=0.1
Accept-Language: zh-cn,zh;q=0.8,en-us;q=0.5,en;q=0.3
Accept-Encoding: gzip, deflate
Referer: http://scrapy-chs.readthedocs.org/zh_CN/0.24/
Cookie: _ga=GA1.2.1612165614.1415584110;
Connection: keep-alive
If-Modified-Since: Mon, 25 Aug 2014 21:59:35 GMT
Cache-Control: max-age=0
```

- meta: frequently used; a dict for passing data between different requests (see the sketch after this list). Combined with cookies it looks like:

```python
request_with_cookies = Request(
    url="http://www.example.com",
    cookies={'currency': 'USD', 'country': 'UY'},
    meta={'dont_merge_cookies': True}
)
```

- encoding: the default 'utf-8' is fine.
- dont_filter: tells the scheduler not to filter this request. Useful when you want to issue the same request multiple times and ignore the duplicates filter. Defaults to False.
- errback: specifies the error-handling function.
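Since meta is the usual way to hand data from one request's callback to the next, here is a minimal sketch; the spider name, URLs, and the "category" key are all hypothetical:

```python
import scrapy


class MetaDemoSpider(scrapy.Spider):
    # hypothetical spider, purely to illustrate passing data via meta
    name = "meta_demo"
    start_urls = ["http://www.example.com/list"]

    def parse(self, response):
        # attach data to the next request via meta
        yield scrapy.Request(
            "http://www.example.com/detail",
            meta={"category": "books"},
            callback=self.parse_detail,
        )

    def parse_detail(self, response):
        # response.meta is a shortcut for response.request.meta
        self.logger.info("category carried over: %s", response.meta["category"])
```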
Response
```python
# partial source
class Response(object_ref):

    def __init__(self, url, status=200, headers=None, body='', flags=None, request=None):
        self.headers = Headers(headers or {})
        self.status = int(status)
        self._set_body(body)
        self._set_url(url)
        self.request = request
        self.flags = [] if flags is None else list(flags)

    @property
    def meta(self):
        try:
            return self.request.meta
        except AttributeError:
            raise AttributeError("Response.meta not available, this response "
                                 "is not tied to any request")
```
Most of the parameters mirror those of Request:
- status: the HTTP response status code
- _set_body(body): sets the response body
- _set_url(url): sets the response URL
- self.request = request: the Request object that produced this response; it is what the meta property delegates to
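A minimal sketch of reading these attributes inside a callback (the spider and URL are hypothetical):

```python
import scrapy


class ResponseDemoSpider(scrapy.Spider):
    # hypothetical spider, purely to illustrate reading Response attributes
    name = "response_demo"
    start_urls = ["http://www.example.com/"]

    def parse(self, response):
        self.logger.info("status: %s", response.status)          # e.g. 200
        self.logger.info("url: %s", response.url)                # set via _set_url()
        self.logger.info("body bytes: %d", len(response.body))   # set via _set_body()
        # response.request is the Request that produced this response,
        # which is why response.meta can delegate to response.request.meta
        self.logger.info("meta: %s", response.meta)
```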
Sending a POST request
- You can send a POST request with the `yield scrapy.FormRequest(url, formdata, callback)` method.
- If you want the program to send a POST request right at startup, override the Spider class's `start_requests(self)` method; the URLs in `start_urls` will then no longer be requested.
```python
import scrapy


class mySpider(scrapy.Spider):
    name = "mySpider"
    # start_urls = ["http://www.example.com/"]

    def start_requests(self):
        url = 'http://www.renren.com/PLogin.do'

        # FormRequest is how Scrapy sends a POST request
        yield scrapy.FormRequest(
            url=url,
            formdata={"email": "mr_mao_hacker@163.com", "password": "axxxxxxxe"},
            callback=self.parse_page
        )

    def parse_page(self, response):
        # do something
        pass
```
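One detail worth knowing: FormRequest only switches the HTTP method to POST when formdata is actually supplied. Constructed without formdata (as renren.py below does, just to attach cookies), it sends an ordinary GET.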
Simulating login
Use the FormRequest.from_response() method to simulate a user login.
Websites usually pre-populate certain form fields through `<input type="hidden">` elements, such as session data or authentication tokens on a login page.
When scraping with Scrapy, if you want those pre-populated fields to be kept while overriding fields such as the username and password, you can use the FormRequest.from_response() method.
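Before the full renren examples below, here is a minimal sketch of the pattern; the login URL, the form field names, and the b"error" check are assumptions for illustration:

```python
import scrapy


class LoginSketchSpider(scrapy.Spider):
    # hypothetical spider: URL and field names are assumptions
    name = "login_sketch"
    start_urls = ["http://www.example.com/login"]

    def parse(self, response):
        # from_response() parses the <form> in the login page, keeps its
        # pre-populated hidden fields (e.g. a CSRF token), and overrides
        # only the fields passed in formdata
        yield scrapy.FormRequest.from_response(
            response,
            formdata={"username": "someuser", "password": "somepass"},
            callback=self.after_login,
        )

    def after_login(self, response):
        # crude success check; the marker string is an assumption
        if b"error" not in response.body:
            self.logger.info("login appears to have succeeded: %s", response.url)
```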
renren.py
```python
# -*- coding: utf-8 -*-
import scrapy


# Log in by replaying cookies: if nothing else works, you can simulate login
# this way. It is a bit more cumbersome, but the success rate is 100%.
class RenrenSpider(scrapy.Spider):
    name = "renren"
    allowed_domains = ["renren.com"]
    start_urls = (
        # 'http://www.renren.com/xxxxx',
        'http://www.renren.com/11111',
        # 'http://www.renren.com/xx',
    )

    # cookie values captured after logging in to the account
    cookies = {
        "anonymid": "ixrna3fysufnwv",
        "_r01_": "1",
        "ap": "327550029",
        "JSESSIONID": "abciwg61A_RvtaRS3GjOv",
        "depovince": "GW",
        "springskin": "set",
        "jebe_key": "f6fb270b-d06d-42e6-8b53-e67c3156aa7e%7Cc13c37f53bca9e1e7132d4b58ce00fa3%7C1484060607478%7C1%7C1486198619601",
        "ver": "7.0",
        "XNESSESSIONID": "e703b11f8809",
        "jebecookies": "98c7c881-779f-4da8-a57c-7464175cd469|||||",
        "ick_login": "4b4a254a-9f25-4d4a-b686-a41fda73e173",
        "_de": "BF09EE3A28DED52E6B65F6A4705D973F1383380866D39FF5",
        "p": "ea5541736f993365a23d04c0946c10e29",
        "first_login_flag": "1",
        "ln_uact": "mr_mao_hacker@163.com",
        "ln_hurl": "http://hdn.xnimg.cn/photos/hdn521/20140529/1055/h_main_9A3Z_e0c300019f6a195a.jpg",
        "t": "691808127750a83d33704a565d8340ae9",
        "societyguester": "691808127750a83d33704a565d8340ae9",
        "id": "327550029",
        "xnsid": "f42b25cf",
        "loginfrom": "syshome",
    }

    # called when the spider starts issuing requests
    def start_requests(self):
        for url in self.start_urls:
            # yield scrapy.Request(url, callback=self.parse)
            # url = "http://www.renren.com/410043129/profile"  # a page only reachable after login
            yield scrapy.FormRequest(url, cookies=self.cookies, callback=self.parse_page)

    def parse_page(self, response):
        print("===========" + response.url)
        with open("deng.html", "wb") as filename:
            filename.write(response.body)
```
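Note that the cookies passed here only need to go out on the first request: Scrapy's built-in cookies middleware stores them and keeps the session alive on subsequent requests automatically, unless a request sets meta={'dont_merge_cookies': True} as shown earlier.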
renren1.py
```python
# -*- coding: utf-8 -*-
import scrapy


class Reren1Spider(scrapy.Spider):
    name = "reren1"
    allowed_domains = ["renren.com"]

    def start_requests(self):
        url = 'http://www.renren.com/PLogin.do'
        yield scrapy.FormRequest(
            url=url,
            formdata={"email": "mr_mao_hacker@163.com", "password": "alarmchime"},
            callback=self.parse_page)

    def parse_page(self, response):
        # after logging in, write the returned page to a file
        print(response.body)
        with open("mao2.html", "wb") as filename:
            filename.write(response.body)
```
renren2.py
```python
# -*- coding: utf-8 -*-
import scrapy


# The orthodox way to simulate login:
# first send a GET request for the login page and pull out the parameters the
# form requires (e.g. zhihu's _xsrf token), then POST them to the server
# together with the account name and password.
class Renren2Spider(scrapy.Spider):
    name = 'renren2'
    allowed_domains = ['renren.com']
    start_urls = ["http://www.renren.com/PLogin.do"]

    def parse(self, response):
        # submit the login form; from_response() keeps the pre-populated fields
        yield scrapy.FormRequest.from_response(
            response,
            formdata={"email": "mr_mao_hacker@163.com",
                      "password": "alarmchime"},  # , "_xsrf": _xsrf},
            callback=self.parse_page
        )

    def parse_page(self, response):
        # after a successful login, click through to a friend's page
        url = "http://www.renren.com/422167102/profile"
        yield scrapy.Request(url, callback=self.parse_newpage)

    def parse_newpage(self, response):
        with open("xiao.html", "wb") as filename:
            filename.write(response.body)
```
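Assuming these spiders live inside a Scrapy project, each one runs with the usual command, e.g. scrapy crawl renren2; opening the saved deng.html, mao2.html, or xiao.html in a browser is a quick way to check whether the login actually worked.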