网络爬虫(6)-Requests库

1.Requests库基本使用

import requests

# Fetch the Baidu home page, then print, in order, the status code, the
# decoded text, the cookies and the raw bytes of the response.
response = requests.get("https://www.baidu.com")
for attribute in ("status_code", "text", "cookies", "content"):
    print(getattr(response, attribute))

使用response.text 时,Requests 会基于 HTTP 响应的文本编码自动解码响应内容,大多数 Unicode 字符集都能被无缝地解码。

使用response.content 时,返回的是服务器响应数据的原始二进制字节流,可以用来保存图片等二进制文件。

很多网站如果直接使用 response.text 会出现乱码(Requests 根据响应头猜测的编码与页面的实际编码不一致),常用解决方法如下:

方法1:

import requests

response = requests.get("https://www.baidu.com")
# Decode the raw response bytes explicitly as UTF-8 instead of reading
# response.text.
html = response.content.decode("utf-8")
print(html)

方法2:
import requests

response = requests.get("https://www.baidu.com")
# Set the encoding first; response.text is then decoded with it.
response.encoding = "utf-8"
print(response.text)

2.Requests请求

import requests

# One helper per HTTP verb, each sent to an httpbin.org endpoint.
base_url = "http://httpbin.org"
requests.post(base_url + "/post")
requests.put(base_url + "/put")
requests.delete(base_url + "/delete")
requests.head(base_url + "/get")
requests.options(base_url + "/get")

1)GET请求:

# 不带参数请求用法:

import requests

# Plain GET request without any parameters.
url = 'http://httpbin.org/get'
response = requests.get(url)
print(response.text)

# 2种带参数请求用法:

- 方法1:

import requests

# The parameters are written directly into the URL's query string.
query = "name=zhaofan&age=23"
response = requests.get("http://httpbin.org/get?" + query)
print(response.text)

- 方法2:

import requests
# The parameters are supplied as a dict through the params argument.
data = dict(name="zhaofan", age=22)
response = requests.get("http://httpbin.org/get", params=data)
# Print the URL of the response first, then its body.
print(response.url)
print(response.text)

2)POST请求:

import requests

# Fields to submit; they are handed to requests.post via the data argument.
payload = {"name": "zhaofan", "age": 23}
response = requests.post("http://httpbin.org/post", data=payload)
print(response.text)

3.模拟登录

1)识别验证码图片内容函数封装:

def getCode(codeImg, codetype=3000, timeout=20):
    """Recognise the captcha in an image file through the YunDaMa platform.

    Logs in to the platform, prints the user id and the account balance,
    submits the image for recognition and returns the recognised value.

    Args:
        codeImg: Path of the captcha image file to recognise.
        codetype: Captcha type ID understood by the platform (for example,
            1004 means four alphanumeric characters). Types are priced
            differently and a wrong type lowers the recognition rate; the
            full list is at http://www.yundama.com/price.html.
        timeout: Recognition timeout in seconds.

    Returns:
        The recognition result returned by ``YDMHttp.decode``, or ``None``
        when the account parameters are still the placeholder values.
    """
    # NOTE(review): the account credentials and the software key are
    # hard-coded here; consider loading them from configuration or the
    # environment instead of keeping them in source.
    # User name of an ordinary (non-developer) YunDaMa account.
    username = 'bobo328410948'
    # Password of that account.
    password = 'bobo328410948'
    # Software ID, required for the developer revenue share; obtained from
    # "My software" in the developer console.
    appid = 6003
    # Software key, obtained from the same place as the software ID.
    appkey = '1f4b564483ae5c907a1d34f8e2f2776c'

    # Guard against running with the placeholder user name.
    if username == 'username':
        print('请设置好相关参数再测试')
        return None

    # NOTE(review): YDMHttp is not defined in this snippet; presumably it
    # comes from the YunDaMa sample client and must be in scope - confirm.
    yundama = YDMHttp(username, password, appid, appkey)

    # Log in to the platform.
    uid = yundama.login()
    print('uid: %s' % uid)

    # Query the account balance.
    balance = yundama.balance()
    print('balance: %s' % balance)

    # Submit the image: path, captcha type ID and timeout in seconds.
    cid, result = yundama.decode(codeImg, codetype, timeout)
    print('cid: %s, result: %s' % (cid, result))
    return result

2)模拟登录代码:

#!/usr/bin/env python
# -*- coding:utf-8 -*-
import requests
if __name__ == "__main__":
    # URL of the login request (captured with a packet-capture tool).
    post_url = 'http://www.renren.com/ajaxLogin/login?1=1&uniqueTimestamp=201873958471'
    # Disguise the request with a browser User-Agent.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36',
    }
    # Text shown in the captcha image, recognised by getCode.
    code_text = getCode('./code.jpg')
    # Form parameters of the login request; 'icode' carries the captcha text.
    formdata = {
        'email': '17701256561',
        'icode': code_text,
        'origURL': 'http://www.renren.com/home',
        'domain': 'renren.com',
        'key_id': '1',
        'captcha_type': 'web_login',
        'password': '7b456e6c3eb6615b2e122a2942ef3845da1f91e3de075179079a3b84952508e4',
        'rkey': '44fd96c219c593f3c9612360c80310a3',
        'f': 'https%3A%2F%2Fwww.baidu.com%2Flink%3Furl%3Dm7m_NSUp5Ri_ZrK5eNIpn_dMs48UAcvT-N_kmysWgYW%26wd%3D%26eqid%3Dba95daf5000065ce000000035b120219',
    }
    # Send the simulated login request.
    response = requests.post(url=post_url, data=formdata, headers=headers)
    # Decode the response body as UTF-8.
    response.encoding = 'utf-8'
    # Write the page with an explicit UTF-8 encoding: the platform default
    # encoding may be unable to represent every character of the page.
    with open('./renren.html', 'w', encoding='utf-8') as fp:
        fp.write(response.text)

4.Cookie处理

import requests
from lxml import etree
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36'
}
# URL that receives the simulated login request.
url = 'https://github.com/session'

# Create a session object; both the login page and the login request are
# sent through it.
session = requests.Session()

# Request the login page through the session and parse its HTML.
login_page = session.get(url='https://github.com/login', headers=headers)
tree = etree.HTML(login_page.text)

# Extract the dynamic token value from the login form.
token = tree.xpath('//*[@id="login"]/form/input[2]/@value')[0]

# Login form fields, including the dynamic token extracted above.
data = {
    'commit': 'Sign in',
    'utf8': '',
    'authenticity_token': token,
    'login': 'bobo328410948@sina.com',
    'password': 'bobo@15027900535',
    'webauthn-support': 'supported',
}

# Send the simulated login request through the same session object.
login_result = session.post(url=url, headers=headers, data=data)
page_text = login_result.text

# Persist the returned page to disk.
with open('./git.html', 'w', encoding='utf-8') as fp:
    fp.write(page_text)

5.代理

import requests
import random
if __name__ == "__main__":
    # User-Agent headers of different browsers.
    header_list = [
        {"user-agent": "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Maxthon 2.0)"},    # Maxthon
        {"user-agent": "Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1"},    # Firefox
        {"user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11"},    # Chrome
    ]
    # Different proxy servers. The Requests documentation requires proxy
    # URLs to include the scheme, so it is spelled out here.
    proxy_list = [
        {"http": "http://112.115.57.20:3128"},
        {"http": "http://121.41.171.223:3128"},
    ]
    # Pick a User-Agent and a proxy at random.
    header = random.choice(header_list)
    proxy = random.choice(proxy_list)
    url = 'http://www.baidu.com/s?ie=UTF-8&wd=ip'
    # The proxies argument sets the proxy used for this request.
    response = requests.get(url=url, headers=header, proxies=proxy)
    # The raw bytes are written unchanged, so no text encoding needs to be
    # set on the response.
    with open('daili.html', 'wb') as fp:
        fp.write(response.content)

6.异常处理

import requests
from requests.exceptions import ReadTimeout,ConnectionError,RequestException

try:
    # The keyword must be spelled `timeout`: a misspelt keyword raises a
    # TypeError, which none of the handlers below would catch.
    response = requests.get("http://httpbin.org/get", timeout=0.1)
except ReadTimeout:
    # The server did not send data within the timeout.
    print("timeout")
except ConnectionError:
    print("connection Error")
except RequestException:
    # Any other error raised by Requests.
    print("error")
else:
    # Reached only when the request succeeded.
    print(response.status_code)
posted @ 2019-08-01 23:42  麦小秋  阅读(306)  评论(0编辑  收藏  举报