两种方法模拟登陆
1.直接携带cookie
import re

import scrapy


class RenrenSpider(scrapy.Spider):
    """Method 1: log in to renren.com by replaying a cookie string copied from the browser."""

    name = 'renren'
    allowed_domains = ['renren.com']
    start_urls = ['http://renren.com/']

    def start_requests(self):
        """Override the default start-request handling to attach the session cookies."""
        # Raw "key=value; key=value" cookie string pasted from the browser dev tools.
        # The triple-quoted literal spans several lines, so entries carry stray newlines.
        cookies = '''anonymid=juzai6na-g0fmvf; depovince=GW; _r01_=1; ick_login=9de0dec9-4f94-42e0-819b-22df9e9adf66; ick=75ca63f4-c056-4af0-ba6e-7683bb07d04d;
jebecookies=747a7092-f53c-40ae-bc0b-90b3f9ab5e2d|||||; JSESSIONID=abcjUmG7wh1SragUBfEPw; _de=8B28AA93122391F898B641D1F469956B; p=9984be9e31957abbf89e6751ad2fd8f48;
first_login_flag=1; ln_uact=18781736136; ln_hurl=http://head.xiaonei.com/photos/0/0/men_main.gif; t=59071958da717542e6a80ffd0df189c38;
societyguester=59071958da717542e6a80ffd0df189c38; id=970578188; xnsid=a1ea20ee; ver=7.0; loginfrom=null;
jebe_key=ed626104-9dc0-45aa-961c-2cfea0e1935d%7C570ae1432b36360003bbd95b7fb6661a%7C1556356655118%7C1%7C1556356654129;
wp_fold=0; XNESSESSIONID=2d1bc0ef1740; vip=1'''
        # Bug fixes vs. the original `{i.split('=')[0]: i.split('=')[1] ...}`:
        #  * split('=', 1) keeps '=' characters inside cookie values intact;
        #  * strip() removes the newlines/spaces left by the multi-line literal
        #    (the original produced corrupt keys such as '\njebecookies').
        cookies = dict(
            pair.strip().split('=', 1)
            for pair in cookies.split('; ')
            if '=' in pair
        )
        url = 'http://www.renren.com/970578188/profile?v=info_timeline'
        yield scrapy.Request(url, callback=self.parse_detail, cookies=cookies)

    def parse_detail(self, response):
        """Print the relationship-status text as a quick logged-in check."""
        res = response.xpath("//div[@class='love-infobox']/p/text()").extract_first()
        print(res)
        # print(re.findall(r'单身', response.body.decode()))
2.找到发送post请求的url地址,带上登录信息,发送请求:scrapy.FormRequest
# Method 2 example: simulated GitHub login via a POSTed form.
class Renren1Spider(scrapy.Spider):
    name = 'renren1'
    allowed_domains = ['github.com']
    start_urls = ['http://github.com/login']

    # NOTE(review): the original defined `parse` twice, so the second
    # definition silently replaced the first and variant 1 was dead code.
    # The manual variant is kept under a different name; `parse` (variant 2)
    # remains the active default callback, matching the original behavior.
    def parse_manual(self, response):
        """Variant 1: collect every hidden form field by hand and POST the session form."""
        authenticity_token = response.xpath("//input[@name='authenticity_token']/@value").extract_first()
        utf8 = response.xpath("//input[@name='utf8']/@value").extract_first()
        commit = response.xpath("//input[@name='commit']/@value").extract_first()
        # Full form data required by GitHub's /session endpoint.
        post_data = dict(
            login='tangpinggui',
            password='***********',
            authenticity_token=authenticity_token,
            utf8=utf8,
            commit=commit,
        )
        yield scrapy.FormRequest(
            url="https://github.com/session",
            formdata=post_data,
            callback=self.after_login,
        )

    def parse(self, response):
        """Variant 2: only supply login/password; FormRequest.from_response fills the rest."""
        post_data = dict(
            login='tangpinggui',
            password='*********',
        )
        yield scrapy.FormRequest.from_response(
            response,  # automatically locates the <form> in the response
            formdata=post_data,
            callback=self.after_login,
        )

    def after_login(self, response):
        """Print a post-login page element to verify the login succeeded."""
        res = response.xpath("//a[@class='btn btn-outline mt-2']/text()").extract_first()
        print(res)
# 模拟人人网网登陆 class Renren1Spider(scrapy.Spider): name = 'renren1' allowed_domains = ['renren.com'] start_urls = ['http://renren.com'] """ email: 18781736136 icode: origURL: http://www.renren.com/970578188/profile?v=info_timeline domain: renren.com key_id: 1 captcha_type: web_login password: 6af626fe325aa7fcea5e6ff3c541404d9104667d6d941a5c5c30390c2d5da8ad rkey: 86cfb8063d4b47d05407cc549819f975 f: """ # func 1 def parse(self, response): origURL = response.xpath("//input[@name='origURL']/@value").extract_first() domain = 'renren.com' key_id = response.xpath("//input[@name='key_id']/@value").extract_first() captcha_type = response.xpath("//input[@name='captcha_type']/@value").extract_first() # rkey = response.xpath("//input[@name='rkey']/@value").extract_first() post_data = dict( email='1********', password='**********', origURL=origURL, domain=domain, key_id=key_id, captcha_type=captcha_type, # rkey='', #不知道怎么获取,貌似不要也能登录 f='' ) yield scrapy.FormRequest( url="http://www.renren.com/PLogin.do", formdata=post_data, callback=self.after_login ) # func 2 def parse(self, response): post_data = dict( email='1********', password='**************', ) yield scrapy.FormRequest.from_response( response, # 自动从response找到form表单中 formdata=post_data, callback=self.after_login ) def after_login(self, response): print('start....') with open('renren.html', 'w') as f: f.write(response.body.decode())