利用验证码登录豆瓣页面
这个程序写完已经很久,细节都快忘了,所以在这里简单地整理一下思路。
1.豆瓣的登录页面采用的是带图片验证码的登录方式。
首先要拿到登录页面的链接,点击(登录豆瓣)进入登录页面(首先要确保自己有一个豆瓣账号),datas里要写入自己的账号与密码才能登录
# Login form payload. Put your own account and password into form_email /
# form_password; captcha and captcha_id come from the captcha steps below.
datas = {
    'source': 'index_nav',
    'redir': 'https://www.douban.com/',
    'form_email': '15532108480',
    'form_password': '228yuhailong',
    'captcha-solution': captcha,
    'captcha-id': captcha_id,
}
2.用lxml解析登录页面:按F12检查元素找到验证码图片的位置,用lxml提取出图片链接,然后下载验证码图片并保存到imgs目录。
3.用lxml取得验证码id,即name为captcha-id的input标签的value(例如 YFhlJDpFopxc14NpzMJCrybn:en)
4.取得豆瓣登录页面链接,利用已有账号进行登录
# Step 4: send the login form (account, password, captcha answer and
# captcha id) to the login URL through the opener `oper`.
url = 'https://accounts.douban.com/login'
datas = {
    'source': 'index_nav',
    'redir': 'https://www.douban.com/',
    'form_email': '15532108480',
    'form_password': '228yuhailong',
    'captcha-solution': captcha,
    'captcha-id': captcha_id,
}
# The form must be URL-encoded and sent as bytes to become a POST body.
data_encoded = urllib.parse.urlencode(datas).encode('utf-8')
response = oper.open(url, data_encoded)
content = response.read()
# bytes.decode() defaults to UTF-8.
html = content.decode()
5.将登录后返回的html写入文件(douban.html)
# Step 5: save the HTML returned after logging in to Douban.
spath = './douban.html'
# The context manager closes the file even if the write raises.
with open(spath, 'w', encoding='utf-8') as f:
    f.write(html)
6.getImg方法
def getImg(url,imgName):
    """Download the image at ``url`` and save it as ``./imgs/<imgName>.jpg``.

    Uses the module-level ``Request``, ``urlopen`` and ``headers`` names.

    Args:
        url: Link of the image to fetch.
        imgName: File name (without extension) for the saved image.

    Returns:
        None. If the download fails with a ``URLError`` (which includes
        HTTP error responses) a message is printed and nothing is written.
    """
    # Function-scope imports so the snippet does not depend on extra
    # top-of-file imports.
    import os
    from urllib.error import URLError

    req_timeout = 5  # seconds
    req = Request(url=url, headers=headers)
    try:
        # urllib signals connection failures with URLError. The previous
        # handler named ``Request.exceptions.ConnectionError``, an attribute
        # urllib.request.Request does not have, so any failure surfaced as
        # an AttributeError instead of being handled.
        with urlopen(req, None, req_timeout) as f:
            pic = f.read()
    except URLError:
        print(u'链接失败')
        return

    # Create the target directory on first use so the write cannot fail
    # just because ./imgs is missing.
    os.makedirs('./imgs', exist_ok=True)
    imgPath = './imgs/%s.jpg' % (imgName)  # where the image is stored
    with open(imgPath, 'wb') as fp:
        fp.write(pic)
代码时间
爬虫py文件
"""Log in to Douban with an account, password and manually typed captcha."""
import http.cookiejar
import urllib.parse
import urllib.request

from lxml import etree

from spiderImg import getImg

# Request headers that imitate a browser.
head = {
    'Connection': 'Keep-Alive',
    'Accept': 'text/html, application/xhtml+xml, */*',
    'Accept-Language': 'en-US,en;q=0.8,zh-Hans-CN;q=0.5,zh-Hans;q=0.3',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko'
}


def makeMyOpener(head):
    """Return a urllib opener that stores cookies and sends ``head`` headers.

    Args:
        head: Mapping of HTTP header names to values.

    Returns:
        An opener built with an ``HTTPCookieProcessor`` backed by a fresh
        ``CookieJar``; its ``addheaders`` holds the (name, value) pairs.
    """
    cj = http.cookiejar.CookieJar()
    opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cj))
    opener.addheaders = list(head.items())
    return opener


# Fetch the login page; the same opener is reused for the POST below so the
# cookies received here are sent back with the login request.
oper = makeMyOpener(head)
with oper.open('https://accounts.douban.com/login', timeout = 1000) as uop:
    data = uop.read()
html = data.decode()

# Uncomment to dump the login page for inspection:
# spath = './doubanLogin.html'
# with open(spath, "w", encoding='utf-8') as f:
#     f.write(html)
#print(html)

# Extract the captcha image link with lxml and download the image.
selector = etree.HTML(html)
links = selector.xpath('//img[@id="captcha_image"]/@src')  # captcha image link
for link in links:
    print(link)
    getImg(link,'captcs')

# Extract the captcha id; stays 0 when the page has no captcha-id input.
captcha_ids = selector.xpath('//input[@name="captcha-id"]/@value')
captcha_id = 0
for cid in captcha_ids:
    captcha_id = cid
    print(captcha_id)

# The user reads the downloaded captcha image and types the answer.
captcha=input("请输入验证码:")
print(captcha)

# Submit the login form. Put your own account and password into
# form_email / form_password.
url = 'https://accounts.douban.com/login'
datas = {
    'source': 'index_nav',
    'redir': 'https://www.douban.com/',
    'form_email': '15532108480',
    'form_password': '228yuhailong',
    'captcha-solution': captcha,
    'captcha-id': captcha_id,
}
data_encoded = urllib.parse.urlencode(datas).encode(encoding='utf-8')
with oper.open(url, data_encoded) as response:
    content = response.read()
html = content.decode()
#print(html)

# Save the HTML returned after logging in.
spath = './douban.html'
with open(spath, "w", encoding='utf-8') as f:
    f.write(html)
方法py文件
# -*- coding: utf-8 -*-
"""Helper module: download an image and store it under ./imgs."""
import os
from urllib.error import URLError
from urllib.request import Request
from urllib.request import urlopen

# Request header that imitates a browser.
headers = {'User-Agent':'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'}

# Sample image link for the standalone demo call at the bottom of the file.
#url = 'https://ss0.bdstatic.com/94oJfD_bAAcT8t7mm9GUKT-xh_/timg?image&quality=100&size=b4000_4000&sec=1510537362&di=3f1f93bb6bf35c7724e3b5c435528187&src=http://www.zhlzw.com/UploadFiles/Article_UploadFiles/201204/20120412123921838.jpg'


def getImg(url,imgName):
    """Download the image at ``url`` and save it as ``./imgs/<imgName>.jpg``.

    Args:
        url: Link of the image to fetch.
        imgName: File name (without extension) for the saved image.

    Returns:
        None. If the download fails with a ``URLError`` (which includes
        HTTP error responses) a message is printed and nothing is written.
    """
    req_timeout = 5  # seconds
    req = Request(url=url, headers=headers)
    try:
        # urllib signals connection failures with URLError. The previous
        # handler named ``Request.exceptions.ConnectionError``, an attribute
        # urllib.request.Request does not have, so any failure surfaced as
        # an AttributeError instead of being handled.
        with urlopen(req, None, req_timeout) as f:
            pic = f.read()
    except URLError:
        print(u'链接失败')
        return

    # Create the target directory on first use so the write cannot fail
    # just because ./imgs is missing.
    os.makedirs('./imgs', exist_ok=True)
    imgPath = './imgs/%s.jpg' % (imgName)  # where the image is stored
    with open(imgPath, 'wb') as fp:
        fp.write(pic)


# Standalone demo: uncomment the sample url above and the call below.
# getImg(url,'mm')
说一下方法py文件里被注释掉的几行代码:
取消这几行的注释后,这个文件本身就可以单独运行,把图片下载下来并存入文件(url是图片链接)。
getImg(url,'mm') 的两个参数分别是图片链接和保存图片时使用的文件名。
到此为止,利用验证码登录豆瓣完成(一定要注意有一个自己的账号,否则登录不成功)