如何爬取需要cookie登录和验证码的页面

实例:

需求:获取人人网用户登录过后的个人主页数据

 1 #云打码平台登录,直接下载引用就好,无需更改
 2 import http.client, mimetypes, urllib, json, time, requests
 3 class YDMHttp:
 4 
 5     apiurl = 'http://api.yundama.com/api.php'
 6     username = ''
 7     password = ''
 8     appid = ''
 9     appkey = ''
10 
11     def __init__(self, username, password, appid, appkey):
12         self.username = username  
13         self.password = password
14         self.appid = str(appid)
15         self.appkey = appkey
16 
17     def request(self, fields, files=[]):
18         response = self.post_url(self.apiurl, fields, files)
19         response = json.loads(response)
20         return response
21     
22     def balance(self):
23         data = {'method': 'balance', 'username': self.username, 'password': self.password, 'appid': self.appid, 'appkey': self.appkey}
24         response = self.request(data)
25         if (response):
26             if (response['ret'] and response['ret'] < 0):
27                 return response['ret']
28             else:
29                 return response['balance']
30         else:
31             return -9001
32     
33     def login(self):
34         data = {'method': 'login', 'username': self.username, 'password': self.password, 'appid': self.appid, 'appkey': self.appkey}
35         response = self.request(data)
36         if (response):
37             if (response['ret'] and response['ret'] < 0):
38                 return response['ret']
39             else:
40                 return response['uid']
41         else:
42             return -9001
43 
44     def upload(self, filename, codetype, timeout):
45         data = {'method': 'upload', 'username': self.username, 'password': self.password, 'appid': self.appid, 'appkey': self.appkey, 'codetype': str(codetype), 'timeout': str(timeout)}
46         file = {'file': filename}
47         response = self.request(data, file)
48         if (response):
49             if (response['ret'] and response['ret'] < 0):
50                 return response['ret']
51             else:
52                 return response['cid']
53         else:
54             return -9001
55 
56     def result(self, cid):
57         data = {'method': 'result', 'username': self.username, 'password': self.password, 'appid': self.appid, 'appkey': self.appkey, 'cid': str(cid)}
58         response = self.request(data)
59         return response and response['text'] or ''
60 
61     def decode(self, filename, codetype, timeout):
62         cid = self.upload(filename, codetype, timeout)
63         if (cid > 0):
64             for i in range(0, timeout):
65                 result = self.result(cid)
66                 if (result != ''):
67                     return cid, result
68                 else:
69                     time.sleep(1)
70             return -3003, ''
71         else:
72             return cid, ''
73 
74     def report(self, cid):
75         data = {'method': 'report', 'username': self.username, 'password': self.password, 'appid': self.appid, 'appkey': self.appkey, 'cid': str(cid), 'flag': '0'}
76         response = self.request(data)
77         if (response):
78             return response['ret']
79         else:
80             return -9001
81 
82     def post_url(self, url, fields, files=[]):
83         for key in files:
84             files[key] = open(files[key], 'rb');
85         res = requests.post(url, files=files, data=fields)
86         return res.text
云打码平台代码
 1 def parse_codeImg(imgPath):
 2     # 用户名
 3     username    = 'xxxx'#自己注册的用户名
 4 
 5     # 密码
 6     password    = 'xxxx'    #自己注册的密码                     
 7 
 8     # 软件ID,开发者分成必要参数。登录开发者后台【我的软件】获得!
 9     appid       = 6372                                     
10 
11     # 软件密钥,开发者分成必要参数。登录开发者后台【我的软件】获得!
12     appkey      = '9b672eb204d7eede7ddeda5a87d7be08'    
13 
14     # 图片文件
15     filename    = imgPath                       
16 
17     # 验证码类型,# 例:1004表示4位字母数字,不同类型收费不同。请准确填写,否则影响识别率。在此查询所有类型 http://www.yundama.com/price.html
18     codetype    = 2004
19 
20     # 超时时间,秒
21     timeout     = 30                                    
22 
23     # 检查
24     if (username == 'username'):
25         print('请设置好相关参数再测试')
26     else:
27         # 初始化
28         yundama = YDMHttp(username, password, appid, appkey)
29 
30         # 登陆云打码
31         uid = yundama.login();
32         print('uid: %s' % uid)
33 
34         # 查询余额
35         balance = yundama.balance();
36         print('balance: %s' % balance)
37 
38         # 开始识别,图片路径,验证码类型ID,超时时间(秒),识别结果
39         cid, result = yundama.decode(filename, codetype, timeout);
40         print('cid: %s, result: %s' % (cid, result))
41         return result
云打码平台代码2
 1 import requests
 2 from lxml import etree
 3 import json
 4 import time
 5 #创建一个session对象,会自动保存cookie
 6 session=requests.session()
 7 #获取人人网URL
 8 url='http://www.renren.com'
 9 #仿造headers
10 headers = {
11     'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36'
12 }
13 page_text=requests.get(url=url,headers=headers).text
14 #解析验证码图片,保存到本地
15 tree=etree.HTML(page_text)
16 code_img_url=tree.xpath('//img[@id="verifyPic_login"]/@src')[0]
17 code_img_data=requests.get(url=code_img_url,headers=headers).content
18 with open('./code.png','wb') as fp:
19     fp.write(code_img_data)
20     print("验证码存储成功!!")
21 code_text=parse_codeImg('./code.png')
22 print(code_text)
23 #登录操作,获取cookie
24 #此url以及data数据需要用Fidder工具抓包获取
25 login_url="http://www.renren.com/ajaxLogin/login?1=1&uniqueTimestamp=20181131725329"
26 data={
27     "email":"18526303496",
28     "icode":code_text,
29     "origURL":"http://www.renren.com/home",
30     "domain":"renren.com",
31     "key_id":"1",
32     "captcha_type":"web_login",
33     "password":"3f06abf49c06d3f2dfce6554f070677f2459a14159d738eb08f8f7922280f5b7",
34     "rkey":"3ca02f6d93a15caf7d0c0b3637abf5a8",
35     "f":'http%3A%2F%2Fwww.renren.com%2F969092014'   
36 }
37 #使用session发起请求,将cookie存储到session,保证请求成功,
38 session.post(url=login_url,headers=headers,data=data)
39 
40 #进行个人主页页面的数据爬取
41 personoal_url='http://www.renren.com/969092014/profile'
42 page_text=session.get(url=personoal_url,headers=headers).text
43 with open('./renren.html','w',encoding='utf-8')as fp:
44     fp.write(page_text)
45     print('over')
主代码

 

posted @ 2018-12-12 20:45  北伽  阅读(656)  评论(0)  编辑  收藏  举报