2019/6/17

今日内容:

1.requests之POST请求

2.requests高级用法

3.selenium模块

4.万能破解登录

 上节课作业正确内容

# 电影详情页url、图片链接、电影名称、导演、主演、电影上映时间、电影评分、评价人数、简介
   <div class="item">.*?href="(.*?)">.*?src="(.*?)" class="">.*?<span class="title">(.*?)</span>.*?<div class="bd">.*?导演:(.*?)<br>(.*?)</p>.*?<span class="rating_num".*?>(.*?)</span>.*?<span>(.*?)人评价.*?<span class="inq">(.*?)</span>

base_url = 'https://movie.douban.com/top250?start = {}&filter = '

 n = 0

for line in range (10)  :

print ()

url = base_url.format(n)

print (type(n))

n+=25

print(url)

#1、往豆瓣TOP250发送请求获取响应数据

response = requests.get(url, headers=headers)

 

#解压复制每一部电影

detail_url,movie_jpg,name,daoyan,timer,point,num,desc = movie_content

data = f'电影名称:{name}, 详情页url:{detail_url}, 图片url:{movie_jpg}, ' \
       f'导演:{daoyan}, 上映时间:{timer}, 评分:{point}, 评分人数:{num}, 简介:{desc}\n'

print(data)

请求URL

https://github.com/session

 

联系方式:

POST

 

请求头:

Referer: https://github.com/login

User-Agent:mozilla/5.0 (window NT 10.0: wow64) Applowobkit?

 

请求体:

只有POST请求才会有请求体

commit: sign in

utf8:

authenticity_token: vx79esfcosfdgs/sdfsfsfgsfg/sfs

login: tank.jaml

password:kemit32423424

webauthn-support: unsupported

 

'  '  '

请求url

https://github.com/login

请求方式:

GET

响应头:

Set-Cookie

请求头:

cookie

user-agent

'  '  '

 

headers = {
    'user-agent': 'mozilla/5.0 (windows nt 10.0; wow64) applewebkit/535.36 (khtml,)'}

 

response = requests.get(url='https://github.com/login', headers=headers)

#print(response.text)

 

authenticity_token = re.findall('<input type="hidden" name="authenticity_token" value="(.*?)" />', response.text, re.S)[0]

 

print(authenticity_token)

''' post请求访问github cookie有时间限制 请求URL: 请求方式:post 请求头referer:(上一个页面地址)        user_agent: 请求体:(只有post有)
''' #访问login页获取token信息
''' 请求URL:     http://github.com/login 请求方式:     get 响应头:set——cookie:(服务端反诬,告诉浏览器,要设置session) 请求头:     cookie     user—agent ''' import requests import re headers={...... } response=requests.get('dizhi',headers=headers) login_cookies=response.cookie.get_dict()
token=re.findall('zhengzeguize',response.tex,re.S)[0] print(token)    #找到第一个页面的token
#登录  往session发送post请求 #请求的URL=http://github.com/session #携带请求头、请求体、login页的cookies信息 headers2={     'referer':'',     'user-agent':'',     #cookie } #拼接请求体 form={     请求头里面的东西(其中authenticity_token:token)#上面获取第一个页面的token } res=requests.post('dizhi',data=form,headers=headers2,cookies=login_cookies)#访问第一个页面返回的cookies with open ('gui.html','w',encoding='utf-8') as f:     f.write(res.text)
 
response = requests.get(url = 'http://github.com/lopgin',headers = headers)
#print(response.text)
#把login页返回的cookies信息转换成字典
login_cookies = response.cookies.get_dict()
authenticity_token = re.findall('<input type="hidden" name="authenticity_token" value="(.*?)" />', response.text, re.S)[0]
 
print(authenticity_token)
 

 import requests

# Minimal examples of every HTTP verb the requests library supports.
# Each call returns a Response object bound to `r`.
r = requests.get('https://api.github.com/events') 

# POST with a form-encoded body (`data=` dict).
r = requests.post('http://httpbin.org/post', data = {'key':'value'}) 

# PUT with a form-encoded body.
r = requests.put('http://httpbin.org/put', data = {'key':'value'})

# DELETE the target resource.
r = requests.delete('http://httpbin.org/delete')

# HEAD: headers only, no body is downloaded.
r = requests.head('http://httpbin.org/get')

# OPTIONS: ask the server which methods the URL supports.
r = requests.options('http://httpbin.org/get')

 

from urllib.parse import urlencode import requests

# q后面携带的是中文墨菲定律 response1 = requests.get('https://list.tmall.com/search_product.htm?q=%C4%AB%B7%C6%B6%A8%C2%C9') print(response1.text)

# 因为字符编码的问题,以至于中文变成了一些特殊的字符,所以我们要找到一种解决方案 url = 'https://list.tmall.com/search_product.htm?' + urlencode({'q': '墨菲定律'}) response2 = requests.get(url) print(response2.text)

# get方法为我们提供了一个参数params,它内部其实就是urlencode response3 = requests.get('https://list.tmall.com/search_product.htm?', params={"q": "墨菲定律"}) print(response3.text)

 

import requests

headers = {     'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.76 Mobile Safari/537.36', }

response = requests.get('https://www.github.com', headers=headers)

# response响应 print(response.status_code) 

# 获取响应状态码 print(response.url) 

# 获取url地址 print(response.text) 

# 获取文本 print(response.content) 

# 获取二进制流 print(response.headers) 

# 获取页面请求头信息 print(response.history) 

# 上一次跳转的地址 print(response.cookies) 

# # 获取cookies信息 print(response.cookies.get_dict()) 

# 获取cookies信息转换成字典 print(response.cookies.items()) 

# 获取cookies信息转换成字典 print(response.encoding) 

# 字符编码 print(response.elapsed) 

# 访问时间

 

import requests

headers = {     'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.76 Mobile Safari/537.36', }

# 编码问题 response = requests.get('http://www.autohome.com/news', headers=headers) # print(response.text)

# 汽车之家网站返回的页面内容编码为gb2312,而requests的默认编码为ISO-8859-1,如果不设置成gbk则中文乱码 response.encoding = 'gbk' print(response.text)

 

import requests

#往音频地址发送get请求

url = 'http://vd3.badstatic.com/mda-ic4pfhh32svqi/hd/mda-ic4pfhh3eh3ex32svqi.mp4?auth_key = 123123452345444dsfvjfsngswrut834u rgwrghrgububbn'

response = requests.get(url, stream=True)

#stream = True 把content设置为一个迭代器对象

print (response.content)

with open('love_for_GD.mp4','wb') as f :

for content in response.iter_content():

f.write(content)

 

# 代理设置:先发送请求给代理,然后由代理帮忙发送(封ip是常见的事情)

import requests proxies={    

# 带用户名密码的代理,@符号前是用户名与密码    

'http':'http://tank:123@localhost:9527',    

'http':'http://localhost:9527',    

'https':'https://localhost:9527', } response=requests.get('https://www.12306.cn',                     

proxies=proxies)

print(response.status_code)

# 支持socks代理,安装:pip install requests[socks] import requests proxies = {    

'http': 'socks5://user:pass@host:port',    

'https': 'socks5://user:pass@host:port' } respone=requests.get('https://www.12306.cn',                     

proxies=proxies)

print(respone.status_code)

 

 

# from selenium import webdriver  # 用来驱动浏览器的 # import time # # ''' # 隐式等待 # ''' # # 获取驱动对象、 # driver = webdriver.Chrome() # # try: #     # 显式等待: 等待某个元素加载 #     # 参数1: 驱动对象  参数2: 等待时间 #     # wait = WebDriverWait(chrome, 10) # #     driver.get('https://china.nba.com/') # #     # 隐式等待: 等待页面所有元素加载 #     driver.implicitly_wait(10) #     news_tag = driver.find_element_by_class_name('nav-news') #     # 获取标签对象 #     print(news_tag) #     # 获取标签的名字 #     print(news_tag.tag_name) # # #     time.sleep(10) # # finally: #     driver.close()   from selenium import webdriver  # 用来驱动浏览器的 import time
''' ===============所有方法=================== element是查找一个标签 elements是查找所有标签 1、find_element_by_link_text 通过链接文本去找 2、find_element_by_id 通过id去找 3、find_element_by_class_name 4、find_element_by_partial_link_text 5、find_element_by_name 6、find_element_by_css_selector 7、find_element_by_tag_name ''' # 获取驱动对象、 driver = webdriver.Chrome()
try:
# 往百度发送请求 driver.get('https://www.baidu.com/') driver.implicitly_wait(10)
# 1、find_element_by_link_text 通过链接文本去找 # 根据登录 # send_tag = driver.find_element_by_link_text('登录') # send_tag.click() # 2、find_element_by_partial_link_text 通过局部文本查找a标签 login_button = driver.find_element_by_partial_link_text('登') login_button.click() time.sleep(1)
# 3、find_element_by_class_name 根据class属性名查找 login_tag = driver.find_element_by_class_name('tang-pass-footerBarULogin') login_tag.click() time.sleep(1)
# 4、find_element_by_name 根据name属性查找 username = driver.find_element_by_name('userName') username.send_keys('15622792660') time.sleep(1)
# 5、find_element_by_id 通过id属性名查找 password = driver.find_element_by_id('TANGRAM__PSP_10__password') password.send_keys('*******') time.sleep(1)
# 6、find_element_by_css_selector 根据属性选择器查找 # 根据id查找登录按钮 login_submit = driver.find_element_by_css_selector('#TANGRAM__PSP_10__submit') # driver.find_element_by_css_selector('.pass-button-submit') login_submit.click()
# 7、find_element_by_tag_name 根据标签名称查找标签 div = driver.find_element_by_tag_name('div') print(div.tag_name)
time.sleep(10)
finally: driver.close()

 

''' ''' ''' 证书验证(大部分网站都是https) ''' import requests # # 如果是ssl请求,首先检查证书是否合法,不合法则报错,程序终端 # response = requests.get('https://www.xiaohuar.com') # print(response.status_code)  # 改进1:去掉报错,但是会报警告 # import requests # response = requests.get('https://www.xiaohuar.com', verify=False) # # 不验证证书,报警告,返回200 # print(response.status_code)  # 改进2:去掉报错,并且去掉警报信息 # import requests # import urllib3 # urllib3.disable_warnings()  # 关闭警告 # response = requests.get('https://www.xiaohuar.com', verify=False) # print(response.status_code)  # 改进3:加上证书 # 很多网站都是https,但是不用证书也可以访问,大多数情况都是可以携带也可以不携带证书 # 知乎\百度等都是可带可不带 # 有硬性要求的,则必须带,比如对于定向的用户,拿到证书后才有权限访问某个特定网站 # import requests # import urllib3 # # urllib3.disable_warnings()  # 关闭警告 # # 伪代码 # response = requests.get( #     'https://www.xiaohuar.com', #     # verify=False, #     # /path/server.crt证书的存放目录, /path/key #     cert=('/path/server.crt', '/path/key')) # print(response.status_code)   ''' 超时设置 '''  # 超时设置 # 两种超时:float or tuple # timeout=0.1  # 代表接收数据的超时时间 # timeout=(0.1,0.2)  # 0.1代表链接超时  0.2代表接收数据的超时时间  # import requests # response = requests.get('https://www.baidu.com', #                         timeout=0.0001) # # print(response.elapsed) # print(response.status_code)  ''' 代理设置:先发送请求给代理,然后由代理帮忙发送(封ip是常见的事情) ''' # import requests # proxies={ #     # 带用户名密码的代理,@符号前是用户名与密码 #     'http':'http://tank:123@localhost:9527', #     'http':'http://localhost:9527', #     'https':'https://localhost:9527', # } # response=requests.get('https://www.12306.cn', #                      proxies=proxies) # # print(response.status_code) ''' 爬取西刺免费代理:     1.访问西刺免费代理页面     2.通过re模块解析并提取所有代理     3.通过ip测试网站对爬取的代理进行测试     4.若test_ip函数抛出异常代表代理作废,否则代理有效     5.利用有效的代理进行代理测试  <tr class="odd">       <td class="country"><img src="//fs.xicidaili.com/images/flag/cn.png" alt="Cn"></td>       <td>112.85.131.99</td>       <td>9999</td>       <td>         <a href="/2019-05-09/jiangsu">江苏南通</a>       </td>       <td class="country">高匿</td>       <td>HTTPS</td>       <td 
class="country">         <div title="0.144秒" class="bar">           <div class="bar_inner fast" style="width:88%">            </div>         </div>       </td>       <td class="country">         <div title="0.028秒" class="bar">           <div class="bar_inner fast" style="width:97%">            </div>         </div>       </td>        <td>6天</td>       <td>19-05-16 11:20</td>     </tr> re:     <tr class="odd">(.*?)</td>.*?<td>(.*?)</td>  ''' # import requests # import re # import time # # HEADERS = { #     'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36', # } # # # def get_index(url): #     time.sleep(1) #     response = requests.get(url, headers=HEADERS) #     return response # # # def parse_index(text): #     ip_list = re.findall('<tr class="odd">.*?<td>(.*?)</td>.*?<td>(.*?)</td>', text, re.S) #     for ip_port in ip_list: #         ip = ':'.join(ip_port) #         yield ip # # def test_ip(ip): #     print('测试ip: %s' % ip) #     try: #         proxies = { #             'https': ip #         } # #         # ip测试网站 #         ip_url = 'https://www.ipip.net/' # #         # 使用有效与无效的代理对ip测试站点进行访问,若返回的结果为200则代表当前测试ip正常 #         response = requests.get(ip_url, headers=HEADERS, proxies=proxies, timeout=1) # #         if response.status_code == 200: #             print(f'有用的ip:{ip}') #             return ip # #     # 若ip代理无效则抛出异常 #     except Exception as e: #         print(e) # # # 使用代理爬取nba # def spider_nba(good_ip): #     url = 'https://china.nba.com/' # #     proxies = { #         'https': good_ip #     } # #     response = requests.get(url, headers=HEADERS, proxies=proxies) #     print(response.status_code) #     print(response.text) # # # if __name__ == '__main__': #     base_url = 'https://www.xicidaili.com/nn/{}' # #     for line in range(1, 3677): #         ip_url = base_url.format(line) # #         response = get_index(ip_url) # #         # 解析西刺代理获取每一个ip列表 #         ip_list = 
parse_index(response.text) # #         # 循环每一个ip #         for ip in ip_list: #             # print(ip) # #             # 对爬取下来的ip进行测试 #             good_ip = test_ip(ip) # #             if good_ip: #                 # 真是代理,开始测试 #                 spider_nba(good_ip)    ''' 认证设置 ''' import requests # 通过访问github的api来测试 url = 'https://api.github.com/user' HEADERS = {     'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36', }
# 测试1,失败返回401 # response = requests.get(url, headers=HEADERS) # print(response.status_code) # 401 # print(response.text) ''' 打印结果: { "message": "Requires authentication", "documentation_url": "https://developer.github.com/v3/users/#get-the-authenticated-user" } ''' # # # 测试2,通过requests.auth内的HTTPBasicAuth进行认证,认证成功返回用户信息 # from requests.auth import HTTPBasicAuth # response = requests.get(url, headers=HEADERS, auth=HTTPBasicAuth('tankjam', 'kermit46709394')) # print(response.text) # # 测试3,通过requests.get请求内的auth参数默认就是HTTPBasicAuth,认证成功返回用户信息 # response = requests.get(url, headers=HEADERS, auth=('tankjam', 'kermit46709394')) # print(response.text) ''' 上传文件 ''' import requests
# 上传文本文件 # files1 = {'file': open('user.txt', 'rb')} # # files参数是POST请求固定参数 # response = requests.post('http://httpbin.org/post', files=files1) # print(response.status_code) # 200 # print(response.text) # 200 # 上传图片文件 # files2 = {'jpg': open('一拳.jpg', 'rb')} # response = requests.post('http://httpbin.org/post', files=files2) # print(response.status_code) # 200 # print(response.text) # 200 # # 上传视频文件 # files3 = {'movie': open('love_for_GD.mp4', 'rb')} # response = requests.post('http://httpbin.org/post', files=files3) # print(response.status_code) # 200 # print(response.text) # 200

 

 今日作业:

1.整理课堂笔记并编写博客

2.爬取代理

(参考爬取西刺代理代码)

    https://www.kuaidaili.com/free/

3.熟悉selenium模块,敲课上例子

4.自动登录抽屉新热榜

 爬取快代理

''' 爬取快代理: 
    1.访问快代理页面 
    2.通过re模块解析并提取所有代理 
    3.通过ip测试网站对爬取的代理进行测试
    4.若test_ip函数抛出异常代表代理作废,否则代理有效 
    5.利用有效的代理进行代理测试 
    <tr>
                <td data-title="IP">124.205.143.212</td>
                <td data-title="PORT">40585</td>
                <td data-title="匿名度">高匿名</td>
                <td data-title="类型">HTTP</td>
                <td data-title="位置">北京市北京市 鹏博士宽带</td>
                <td data-title="响应速度">2秒</td>
                <td data-title="最后验证时间">2019-06-17 16:30:54</td>
                </tr>
    re:
        <tr>.*?<td data-title="IP">(.*?)</td>.*?<td data-title="PORT">(.*?)</td>
'''
''' 
页面链接 
第一页: 
    https://www.kuaidaili.com/free/ 
第二页: 
    https://www.kuaidaili.com/free/inha/2/ 
'''
import requests
import re
import time
# Desktop Chrome User-Agent so the proxy site serves the normal HTML page
# instead of blocking the default python-requests UA.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36',
}
def get_index(url):
    """Fetch one proxy-listing page and return the Response.

    Sleeps one second first as a crude rate limit so the crawler
    does not hammer the site.
    """
    time.sleep(1)  # polite delay between page fetches
    return requests.get(url, headers=headers)
def parse_index(text):
    """Extract proxies from a kuaidaili listing page.

    Scans *text* for IP/PORT table cells and yields each proxy
    as an "ip:port" string.
    """
    pattern = '<tr>.*?<td data-title="IP">(.*?)</td>.*?<td data-title="PORT">(.*?)</td>'
    for host, port in re.findall(pattern, text, re.S):
        yield f'{host}:{port}'
def test_ip(ip2):
    """Check whether an "ip:port" proxy string works.

    Sends a request to an echo site through the proxy and returns the
    proxy string when the site answers 200; otherwise returns None
    (connection errors are printed, not raised).
    """
    print('测试ip: %s' % ip2)
    try:
        proxies = {'https': ip2}
        # ip test site
        ip_url1 = 'https://www.ipip.net/'
        # A working proxy answers 200 within the 1-second timeout.
        response2 = requests.get(ip_url1, headers=headers, proxies=proxies, timeout=1)
        if response2.status_code == 200:
            # BUG FIX: was `return ip`, a name leaked from the caller's loop;
            # return the proxy that was actually tested.
            return ip2
    except Exception as e:
        # Dead or slow proxy: report it and fall through to return None.
        print(e)


# BUG FIX: spider_nba and the __main__ entry point were accidentally
# indented inside test_ip, so the crawler could never run. They belong
# at module level.
def spider_nba(good_ip1):
    """Fetch the NBA China homepage through a verified proxy and print it."""
    url = 'https://china.nba.com/'
    proxies = {'https': good_ip1}
    response3 = requests.get(url, headers=headers, proxies=proxies)

    print(response3.status_code)
    print(response3.text)


if __name__ == '__main__':
    base_url = 'https://www.kuaidaili.com/free/inha/{}/'
    for line in range(1, 2905):
        ip_url = base_url.format(line)
        response = get_index(ip_url)
        # Parse the listing page into candidate proxies and test each one.
        ip_list = parse_index(response.text)
        for ip in ip_list:
            good_ip = test_ip(ip)
            if good_ip:
                spider_nba(good_ip)
View Code

自动登录抽屉新热榜

from selenium import webdriver
import time

# Automatically log in to dig.chouti.com via a driven Chrome browser.
driver = webdriver.Chrome()
try:
    # BUG FIX: the URL previously had a trailing space ('https://dig.chouti.com/ ').
    driver.get('https://dig.chouti.com/')
    # Implicit wait: poll up to 10s for each element lookup below.
    driver.implicitly_wait(10)
    # Open the login dialog.
    send_tag = driver.find_element_by_id('login_btn')
    send_tag.click()
    # Enter the phone number.
    username = driver.find_element_by_class_name('login-phone')
    username.send_keys('***********')
    time.sleep(1)
    # Enter the password.
    password = driver.find_element_by_class_name('pwd-password-input')
    password.send_keys('***********')
    time.sleep(1)
    # Several elements share the '登录' link text; the second one is the
    # form's submit button.
    login = driver.find_elements_by_link_text('登录')
    login[1].click()
    time.sleep(10)
finally:
    driver.close()
View Code

 

 

 

posted @ 2019-06-17 10:24  邵贤伟  阅读(281)  评论(0编辑  收藏  举报