如何爬取需要 cookie 登录并带有验证码的页面
实例:
需求:获取人人网用户登录过后的个人主页数据
# YunDaMa (yundama.com) captcha-recognition client; can be reused as-is.
import contextlib
import http.client
import json
import mimetypes
import time
import urllib

import requests


class YDMHttp:
    """Small HTTP client for the YunDaMa API served at ``apiurl``.

    Every call posts the account credentials together with a ``method``
    field and decodes the JSON reply. Calls that can fail return the
    negative ``ret`` code from the reply, or -9001 when the reply is empty.
    """

    apiurl = 'http://api.yundama.com/api.php'
    # Class-level defaults; __init__ overrides them per instance.
    username = ''
    password = ''
    appid = ''
    appkey = ''

    def __init__(self, username, password, appid, appkey):
        """Store the account credentials; ``appid`` is kept as a string."""
        self.username = username
        self.password = password
        self.appid = str(appid)
        self.appkey = appkey

    def _payload(self, method, **extra):
        """Build the form fields shared by every API call, plus ``extra``."""
        data = {
            'method': method,
            'username': self.username,
            'password': self.password,
            'appid': self.appid,
            'appkey': self.appkey,
        }
        data.update(extra)
        return data

    def _call(self, data, key, files=None):
        """Send ``data`` and return ``reply[key]``, or an error code.

        Returns -9001 for an empty reply and the reply's ``ret`` value when
        it is negative.
        """
        response = self.request(data, files)
        if not response:
            return -9001
        if response['ret'] and response['ret'] < 0:
            return response['ret']
        return response[key]

    def request(self, fields, files=None):
        """POST ``fields`` (and optional ``files``) and return the parsed JSON.

        ``files`` maps form field names to local file paths.
        """
        return json.loads(self.post_url(self.apiurl, fields, files))

    def balance(self):
        """Return the account balance, or a negative error code."""
        return self._call(self._payload('balance'), 'balance')

    def login(self):
        """Log in and return the user id, or a negative error code."""
        return self._call(self._payload('login'), 'uid')

    def upload(self, filename, codetype, timeout):
        """Upload the image at ``filename`` and return its captcha id.

        Returns a negative error code on failure.
        """
        data = self._payload(
            'upload', codetype=str(codetype), timeout=str(timeout))
        return self._call(data, 'cid', {'file': filename})

    def result(self, cid):
        """Return the recognized text for ``cid``, or '' if none is available."""
        response = self.request(self._payload('result', cid=str(cid)))
        if not response:
            return ''
        return response['text'] or ''

    def decode(self, filename, codetype, timeout):
        """Upload an image and poll for its recognized text.

        Polls once per second, at most ``timeout`` times.

        Returns:
            ``(cid, text)`` on success, ``(cid, '')`` when the upload failed
            (``cid`` is then the non-positive upload result), or
            ``(-3003, '')`` when no text arrived within the polling window.
        """
        cid = self.upload(filename, codetype, timeout)
        if cid <= 0:
            return cid, ''
        for _ in range(timeout):
            result = self.result(cid)
            if result != '':
                return cid, result
            time.sleep(1)
        return -3003, ''

    def report(self, cid):
        """Send a ``report`` call with flag '0' for ``cid``.

        Returns the reply's ``ret`` value, or -9001 for an empty reply.
        """
        response = self.request(
            self._payload('report', cid=str(cid), flag='0'))
        if response:
            return response['ret']
        return -9001

    def post_url(self, url, fields, files=None):
        """POST ``fields`` and the files named in ``files``; return the body text.

        ``files`` maps form field names to local paths. The mapping passed in
        is left untouched and every file opened here is closed once the
        request has completed.
        """
        with contextlib.ExitStack() as stack:
            opened = {
                key: stack.enter_context(open(path, 'rb'))
                for key, path in (files or {}).items()
            }
            res = requests.post(url, files=opened, data=fields)
        return res.text
def parse_codeImg(imgPath, username='xxxx', password='xxxx', appid=6372,
                  appkey='9b672eb204d7eede7ddeda5a87d7be08',
                  codetype=2004, timeout=30):
    """Recognize the captcha image at ``imgPath`` through YunDaMa.

    Args:
        imgPath: Path of the captcha image file.
        username: YunDaMa account name; the default is a placeholder.
        password: YunDaMa account password; the default is a placeholder.
        appid: Software id from the developer console ("My software").
        appkey: Software key from the developer console ("My software").
        codetype: Captcha type id (e.g. 1004 is 4 alphanumeric characters);
            see http://www.yundama.com/price.html for the full list.
        timeout: Recognition timeout in seconds.

    Returns:
        The recognized text, or '' when the credentials are still
        placeholders or no text was obtained.
    """
    # Refuse to call the service while the credentials are still the
    # placeholders shipped with this code.
    if username in ('', 'username', 'xxxx'):
        print('请设置好相关参数再测试')
        return ''

    yundama = YDMHttp(username, password, appid, appkey)

    # Log in to YunDaMa.
    uid = yundama.login()
    print('uid: %s' % uid)

    # Query the remaining balance.
    balance = yundama.balance()
    print('balance: %s' % balance)

    # Run the recognition: image path, captcha type id, timeout in seconds.
    cid, result = yundama.decode(imgPath, codetype, timeout)
    print('cid: %s, result: %s' % (cid, result))
    return result
import json
import time

import requests
from lxml import etree

# A Session keeps the cookies it receives and sends them on later requests.
# Every request below goes through it, so the cookies set while loading the
# login page and the captcha image are the ones sent with the login POST.
session = requests.session()

# Renren home page.
url = 'http://www.renren.com'

# Browser-like request headers.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36'
}
page_text = session.get(url=url, headers=headers).text

# Locate the captcha image and save it locally.
# NOTE: raises IndexError when the page has no such <img> element.
tree = etree.HTML(page_text)
code_img_url = tree.xpath('//img[@id="verifyPic_login"]/@src')[0]
code_img_data = session.get(url=code_img_url, headers=headers).content
with open('./code.png', 'wb') as fp:
    fp.write(code_img_data)
print("验证码存储成功!!")
code_text = parse_codeImg('./code.png')
print(code_text)

# Log in to obtain the cookies.
# The URL and form data were captured with a packet-capture tool (Fiddler).
login_url = "http://www.renren.com/ajaxLogin/login?1=1&uniqueTimestamp=20181131725329"
data = {
    "email": "18526303496",
    "icode": code_text,
    "origURL": "http://www.renren.com/home",
    "domain": "renren.com",
    "key_id": "1",
    "captcha_type": "web_login",
    "password": "3f06abf49c06d3f2dfce6554f070677f2459a14159d738eb08f8f7922280f5b7",
    "rkey": "3ca02f6d93a15caf7d0c0b3637abf5a8",
    "f": 'http%3A%2F%2Fwww.renren.com%2F969092014'
}
# Post through the session so the cookies from the login reply are kept in it.
session.post(url=login_url, headers=headers, data=data)

# Fetch the profile page with the session's cookies and save it.
personoal_url = 'http://www.renren.com/969092014/profile'
page_text = session.get(url=personoal_url, headers=headers).text
with open('./renren.html', 'w', encoding='utf-8') as fp:
    fp.write(page_text)
print('over')